Purpose of this file

This file has all of the code for the the main analysis to reproduce all figures except Figure S6 in the Schmidt et al. manuscript entitled “Microhabitats shape diversity-productivity relationships in freshwater bacterial communities”. To see how to reproduce figure S6 please see the github for this project and go to analysis/OTU_Removal_Analysis.html on the github page.

If you have any questions, please be welcome to email the corresponding author at or tweet her at micro_marian.

Load Libraries

library(devtools)   # Reproducibility (see end of file)
library(phyloseq)   # Easier data manipulation  
library(picante)    # Phylogenetic Tree Analysis   
library(tidyverse)  # Pretty plotting and data manipulation 
library(forcats)    # Recoding factors
library(cowplot)    # Multiple plotting
library(picante)    # Will also include ape and vegan  
library(car)        # Residual analysis 
library(sandwich)   # vcovHC function in post-hoc test 
library(MASS)       # studres in plot_residuals function    
library(caret)      # Cross validation
library(pander)     # Pretty tables      
library(glmnet)     # Lasso regressions 
library(broom)      # Stats from lm regression 
library(purrr)      # Exporting lm results  
library(DT)         # Fancy HTML table output
library(lme4)       # linear mixed effects model
source("code/Muskegon_functions.R")   # Source custom functions  
source("code/set_colors.R")           # Set Colors for plotting

Load data

# Loads a phyloseq object named otu_merged_musk_pruned)
load("data/surface_PAFL_otu_pruned_raw.RData") 
# The name of the phyloseq object is: 
surface_PAFL_otu_pruned_raw 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7806 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 44 sample variables ]
## tax_table()   Taxonomy Table:    [ 7806 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7806 tips and 7804 internal nodes ]
# Remove doubletons!
surface_PAFL_otu_pruned_rm2 <- prune_taxa(taxa_sums(surface_PAFL_otu_pruned_raw) > 2, surface_PAFL_otu_pruned_raw) 
surface_PAFL_otu_pruned_rm2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2979 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 44 sample variables ]
## tax_table()   Taxonomy Table:    [ 2979 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2979 tips and 2977 internal nodes ]
# Remove tree for computational efficiency 
surface_PAFL_otu_pruned_notree_rm2 <- phyloseq(tax_table(surface_PAFL_otu_pruned_rm2), otu_table(surface_PAFL_otu_pruned_rm2), sample_data(surface_PAFL_otu_pruned_rm2)) 

# Gather the metadata in a dataframe
metadata <- data.frame(sample_data(surface_PAFL_otu_pruned_notree_rm2)) %>%
      mutate(fraction = factor(fraction, levels = c("WholePart","WholeFree")),
         lakesite = factor(lakesite,  levels = c("MOT", "MDP", "MBR", "MIN")),
         fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree"),
         lakesite = fct_recode(lakesite, Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"))
row.names(metadata) <- metadata$norep_filter_name

# Replace the sample data 
sample_data(surface_PAFL_otu_pruned_notree_rm2) <- metadata

# Create a dataframe for environmental data only
environmental_data <- metadata %>%
  dplyr::select(Temp_C:DO_percent, -BGA_cellspermL, -SRP_ugperL, fraction, norep_filter_name, -DO_percent) %>%
  dplyr::filter(fraction == "Free") %>%
  dplyr::select(-fraction) %>%
  tibble::column_to_rownames(var = "norep_filter_name") %>%
  dplyr::rename(Temp = Temp_C, SPC = SpCond_uSpercm,
         TDS = TDS_mgperL, ORP = ORP_mV,
         Chla = Chl_Lab_ugperL, Cl = Cl_mgperL,
         SO4 = SO4_mgperL, NO3 = NO3_mgperL,
         NH3 = NH3_mgperL, TKN = TKN_mgperL,
         TP = TP_ugperL, Alk = Alk_mgperL,
         DO = DO_mgperL) %>%
  as.matrix()

#### PCA DATA 
# Scale the data so their variances are the same!
scaled_enviro <- scale(environmental_data)

# Sanity Checks: check that we get mean of 0 and sd of 1
apply(scaled_enviro, 2, mean)
##          Temp           SPC           TDS            pH           ORP          Chla            Cl           SO4           NO3           NH3           TKN            TP           Alk            DO 
## -3.151262e-16 -1.170392e-15 -8.881784e-16 -2.063155e-15  4.614635e-18  2.081668e-17 -2.937375e-16  3.932311e-16  1.237526e-16 -1.155579e-17  7.400583e-17 -5.090894e-17  1.454855e-15  7.401939e-17
apply(scaled_enviro, 2, sd)
## Temp  SPC  TDS   pH  ORP Chla   Cl  SO4  NO3  NH3  TKN   TP  Alk   DO 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1
# Run a principal component analysis via the vegan package
pca_environ <- rda(scaled_enviro) 

# What is the proportion explained by each of the PCA axes? 
summary(pca_environ)$cont$importance
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5     PC6     PC7      PC8     PC9    PC10      PC11
## Eigenvalue            5.4670 4.2310 2.0103 0.9674 0.52881 0.34976 0.24025 0.119868 0.05488 0.02604 0.0047450
## Proportion Explained  0.3905 0.3022 0.1436 0.0691 0.03777 0.02498 0.01716 0.008562 0.00392 0.00186 0.0003389
## Cumulative Proportion 0.3905 0.6927 0.8363 0.9054 0.94317 0.96816 0.98532 0.993881 0.99780 0.99966 1.0000000
# Pull out all of the PCA scores into a dataframe
pca_scores_df <- summary(pca_environ)$sites %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "norep_filter_name") %>%
  mutate(norep_water_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = "")) %>%
  dplyr::select(-c(norep_filter_name, PC3, PC4, PC5, PC6))

# Combine the above dataframe with the rest of the metadata
metadata_pca <- metadata %>%
  mutate(norep_water_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = "")) %>%
  left_join(pca_scores_df, by = "norep_water_name")

Removing the OTUs that are doubletons across the entire dataset leaves a total of 2979 OTUs, a decrease from 7806. This will help decrease the computational load for phylogenetic analysis and also further remove potential sequencing artifacts.

Calculate Community Diversity

# Set the seed for reproducibility
set.seed(777)

# Calculate the alpha diversity with 100 repsampling events
alpha_calc <- calc_alpha_diversity(physeq = surface_PAFL_otu_pruned_notree_rm2)

# What was the minimum sample size? 
min(sample_sums(surface_PAFL_otu_pruned_notree_rm2)) - 1
## [1] 6489
# Put it altogether in a dataframe 
otu_alphadiv <- calc_mean_alphadiv(physeq = surface_PAFL_otu_pruned_notree_rm2,
                           richness_df = alpha_calc$Richness, 
                           exp_shannon_df = alpha_calc$Shannon,
                           simpson_df = alpha_calc$Inverse_Simpson) %>%
  mutate(fraction = factor(fraction, levels = c("Particle","Free")),
         lakesite = factor(lakesite,  levels = c("Outlet", "Deep", "Bear", "River")),
         measure = factor(measure, levels = c("Richness",  "Exponential_Shannon", "Inverse_Simpson", "Simpsons_Evenness")),
         norep_water_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = ""))

otu_alphadiv <- otu_alphadiv %>%
  left_join(pca_scores_df, by = "norep_water_name")

# Make subsetted dataframes for each diversity metric
richness <- filter(otu_alphadiv, measure == "Richness")
shannon <- filter(otu_alphadiv, measure == "Exponential_Shannon")
invsimps <- filter(otu_alphadiv, measure == "Inverse_Simpson")
simpseven <- filter(otu_alphadiv, measure == "Simpsons_Evenness") 

Calculate the Phylogenetic Diversity

# Read in the tree 
RAREFIED_rm2_fasttree <- read.tree(file = "data/PhyloTree/newick_tree_rm2_rmN.tre")
  
# Load in data that has doubletons removed 
load("data/PhyloTree/surface_PAFL_otu_pruned_RAREFIED_rm2.RData")
surface_PAFL_otu_pruned_RAREFIED_rm2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1891 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 70 sample variables ]
## tax_table()   Taxonomy Table:    [ 1891 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1891 tips and 1889 internal nodes ]
# Create the OTU table for picante 
surface_PAFL_RAREFIED_rm2_otu <- matrix(otu_table(surface_PAFL_otu_pruned_RAREFIED_rm2), nrow = nrow(otu_table(surface_PAFL_otu_pruned_RAREFIED_rm2)))
rownames(surface_PAFL_RAREFIED_rm2_otu) <- sample_names(surface_PAFL_otu_pruned_RAREFIED_rm2)
colnames(surface_PAFL_RAREFIED_rm2_otu) <- taxa_names(surface_PAFL_otu_pruned_RAREFIED_rm2)
    
  
## Calculate input for SES_MPD  
# Convert the abundance data to standardized abundanced vegan function `decostand' , NOTE: method = "total"
otu_decostand_total <- decostand(surface_PAFL_RAREFIED_rm2_otu, method = "total")
# check total abundance in each sample
apply(otu_decostand_total, 1, sum)
## MBREJ515 MBREJ715 MBREJ915 MBREK515 MBREK715 MBREK915 MDPEJ515 MDPEJ715 MDPEJ915 MDPEK515 MDPEK715 MDPEK915 MINEJ515 MINEJ715 MINEJ915 MINEK515 MINEK715 MINEK915 MOTEJ515 MOTEJ715 MOTEJ915 MOTEK515 MOTEK715 MOTEK915 
##        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1        1
# check for mismatches/missing species between community data and phylo tree
RAREFIED_rm2_matches <- match.phylo.comm(RAREFIED_rm2_fasttree, otu_decostand_total)
# the resulting object is a list with $phy and $comm elements.  replace our
# original data with the sorted/matched data
phy_RAREFIED_rm2 <- RAREFIED_rm2_matches$phy
comm_RAREFIED_rm2 <- RAREFIED_rm2_matches$comm

# Calculate the phylogenetic distances
phy_dist_RAREFIED_rm2 <- cophenetic(phy_RAREFIED_rm2)

  
## Calculate SES_MPD
###################################### INDEPENDENT SWAP ############################################
# calculate standardized effect size mean pairwise distance (ses.mpd)
unweighted_sesMPD_indepswap_RAREFIED_rm2 <- ses.mpd(comm_RAREFIED_rm2, phy_dist_RAREFIED_rm2, null.model = "independentswap", 
                                     abundance.weighted = FALSE, runs = 999)

WEIGHTED_sesMPD_indepswap_RAREFIED_rm2 <- ses.mpd(comm_RAREFIED_rm2, phy_dist_RAREFIED_rm2, null.model = "independentswap", 
                                     abundance.weighted = TRUE, runs = 999)


# Gather div info
rich_df <- filter(otu_alphadiv, measure == "Richness") %>%
  dplyr::select(norep_filter_name, mean, sd, frac_bacprod, SD_frac_bacprod, fracprod_per_cell_noinf)

invsimps_df <- filter(otu_alphadiv, measure == "Inverse_Simpson") %>%
  dplyr::select(norep_filter_name, mean, sd, frac_bacprod, SD_frac_bacprod, fracprod_per_cell_noinf)


# Prepare to be merged with each other 
unweighted_df <- unweighted_sesMPD_indepswap_RAREFIED_rm2 %>%
  rownames_to_column("names") %>%
  mutate(phylo_measure = "Unweighted_SESMPD") %>%
  make_metadata_norep() %>%
  mutate(fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree")) %>%
  rename(norep_filter_name = names) %>%
  left_join(rich_df, by = "norep_filter_name") %>%
  mutate(lakesite = factor(lakesite, levels = c("MOT", "MDP","MBR", "MIN")),
         lakesite = fct_recode(lakesite, Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"))
  
weighted_df <- WEIGHTED_sesMPD_indepswap_RAREFIED_rm2 %>%
  rownames_to_column("names") %>%
  mutate(phylo_measure = "Weighted_SESMPD") %>%
  make_metadata_norep() %>%
  mutate(fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree")) %>%
  rename(norep_filter_name = names) %>%
  left_join(invsimps_df, by = "norep_filter_name") %>%
  mutate(lakesite = factor(lakesite, levels = c("MOT", "MDP","MBR", "MIN")),
         lakesite = fct_recode(lakesite, Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"))

Main Figures

Have a legend for all plots

Since we have set a parameter in our set_colors.R file that we source at the beginning and use this throughout, we can assure that the below legends will represent the legends called afterwards.

####### Legend for lakesite
legend_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) + 
  geom_jitter(size = 3, width = 0.2) + 
  scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
  scale_shape_manual(values = season_shapes, guide = guide_legend(override.aes = list(size = 4))) +
  theme(legend.position = "bottom", axis.title.x = element_blank(),
        legend.title = element_blank())
# Extract the legend
season_legend <- get_legend(legend_plot)

####### Legend for lakesite
lakesite_legend <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = lakesite)) + 
  geom_jitter(size = 3, width = 0.2) + 
  scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
  scale_shape_manual(values = lakesite_shapes, guide = guide_legend(override.aes = list(size = 4))) +
  theme(legend.position = "bottom", axis.title.x = element_blank(),
        legend.title = element_blank())
# Extract the legend
lakesite_legend <-  get_legend(lakesite_legend)

Figure 1

######################################################### Fraction ABUNDANCe 
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  log10(as.numeric(fraction_bac_abund)) by fraction
## W = 0, p-value = 1.479e-06
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
  group_by(fraction) %>%
  summarize(mean(as.numeric(fraction_bac_abund)))
## # A tibble: 2 x 2
##   fraction `mean(as.numeric(fraction_bac_abund))`
##   <fct>                                     <dbl>
## 1 Particle                                 41169.
## 2 Free                                    734522.
# Make a data frame to draw significance line in boxplot (visually calculated)
dat1 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(6.45,6.5,6.5,6.45)) # WholePart vs WholeFree

poster_a <- 
  ggplot(filter(metadata, norep_filter_name != "MOTEJ515"), 
       aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  ylab("Log10(Bacterial Counts) \n (cells/mL)") +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ##### Particle vs free cell abundances 
  geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=6.55, size = 8, color = "gray40", label= paste("***")) +
  annotate("text", x=1.5, y=6.4, size = 3.5, color = "gray40",
           label= paste("p =", round(frac_abund_wilcox$p.value, digits = 6))) +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.title.x = element_blank())



######################################################### TOTAL PRODUCTION 
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  frac_bacprod by fraction
## W = 33, p-value = 0.02418
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
  group_by(fraction) %>%
  summarize(mean(frac_bacprod))
## # A tibble: 2 x 2
##   fraction `mean(frac_bacprod)`
##   <fct>                   <dbl>
## 1 Particle                 9.96
## 2 Free                    24.1
# Make a data frame to draw significance line in boxplot (visually calculated)
dat2 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(71,72,72,71)) # WholePart vs WholeFree


poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) + 
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  scale_fill_manual(values = fraction_colors, guide = FALSE) +
  scale_shape_manual(values = season_shapes) +
  scale_y_continuous(limits = c(0, 78), expand = c(0,0), breaks = seq(0, 75, by = 15)) +
  ylab("Community Production \n (μgC/L/day)") +
  ##### Particle vs free bulk production  
  geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=73, size = 8, color = "gray40", label= paste("*")) +
  annotate("text", x=1.5, y=69, size = 3.5, color = "gray40",
           label= paste("p =", round(totprod_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",
        axis.title.x = element_blank())


######################################################### TOTAL PRODUCTION 
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  log10(fracprod_per_cell) by fraction
## W = 125, p-value = 6.656e-05
## alternative hypothesis: true location shift is not equal to 0
filter(metadata, norep_filter_name != "MOTEJ515") %>%
  group_by(fraction) %>%
  summarize(mean(fracprod_per_cell))
## # A tibble: 2 x 2
##   fraction `mean(fracprod_per_cell)`
##   <fct>                        <dbl>
## 1 Particle              0.000000482 
## 2 Free                  0.0000000387
# Make a data frame to draw significance line in boxplot (visually calculated)
dat3 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(-5.05,-5,-5,-5.05)) # WholePart vs WholeFree


poster_c <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"), 
       aes(y = log10(fracprod_per_cell), x = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  scale_fill_manual(values = fraction_colors, guide = FALSE) +
  scale_shape_manual(values = season_shapes) +
  ylim(c(-8.5, -4.9)) + 
  ylab("Log10(Per-Capita Production) \n(μgC/cell/day)") +
  ##### Particle vs free per-cell production 
  geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=-4.95, size = 8, color = "gray40", label= paste("***")) +
  annotate("text", x=1.5, y=-5.15,  size = 3.5, color = "gray40",
           label= paste("p =", round(percellprod_wilcox$p.value, digits = 5))) +
  theme(legend.position = "none",
        axis.title.x = element_blank())


######## FIGURE 1
# legend
legend <- get_legend(poster_a)

row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
          labels = c("A", "B", "C"),
          ncol = 3, nrow = 1)

fig_1 <- plot_grid(row1_plots, season_legend,
           ncol = 1, nrow = 2, 
           rel_heights = c(1, 0.05)); fig_1

## Change the theme to be minimal
prezzzz <- plot_grid(poster_a + theme_minimal() + theme(legend.position = "none"), 
          poster_b + theme_minimal() + theme(legend.position = "none"),  
          poster_c + theme_minimal() + theme(legend.position = "none"), 
          labels = c("A", "B", "C"),
          ncol = 3, nrow = 1)

fig_1_minimal <- plot_grid(prezzzz, season_legend,
           ncol = 1, nrow = 2, 
           rel_heights = c(1, 0.05)); fig_1_minimal

Linear Regressions with Fraction Diversity

# Free-Living samples only 
div_measures_free <- otu_alphadiv %>%
  dplyr::select(fraction, mean, measure, frac_bacprod, fracprod_per_cell_noinf) %>%
  filter(fraction == "Free") %>%
  dplyr::select(-fraction)
# Particle-associated samples only 
div_measures_part <- otu_alphadiv %>%
  dplyr::select(fraction, mean, measure, frac_bacprod, fracprod_per_cell_noinf) %>%
  filter(fraction == "Particle") %>%
  dplyr::select(-fraction)

## All of the samples!
div_measures_all <- otu_alphadiv %>%
  dplyr::select(mean, measure, frac_bacprod, fracprod_per_cell_noinf)

########################################################
##############  Particle: Community-wide 
lm_divs_particle_comm <- div_measures_part %>%
  dplyr::select(-fracprod_per_cell_noinf) %>%
  group_by(measure) %>% 
  do(glance(lm(frac_bacprod ~ mean, data = .))) %>% 
  mutate(fraction = "Particle", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  Particle: Community-wide 
lm_divs_particle_percap <- div_measures_part %>%
  dplyr::select(-frac_bacprod) %>%
  group_by(measure) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>% 
  mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

########################################################
##############  Free: Community-wide 
lm_divs_free_comm <- div_measures_free %>%
  dplyr::select(-fracprod_per_cell_noinf) %>%
  group_by(measure) %>% 
  do(glance(lm(frac_bacprod ~ mean, data = .))) %>% 
  mutate(fraction = "Free", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  Free: Community-wide 
lm_divs_free_percap <- div_measures_free %>%
  dplyr::select(-frac_bacprod) %>%
  group_by(measure) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>% 
  mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

########################################################
##############  All Samples: Community-wide 
lm_divs_all_comm <- div_measures_all %>%
  dplyr::select(-fracprod_per_cell_noinf) %>%
  group_by(measure) %>% 
  do(glance(lm(frac_bacprod ~ mean, data = .))) %>% 
  mutate(fraction = "All Samples", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  All Samples: Community-wide 
lm_divs_all_percap <- div_measures_all %>%
  dplyr::select(-frac_bacprod) %>%
  group_by(measure) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>% 
  mutate(fraction = "All Samples", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 


## Filtered PCA Linear Model Results
# Put all of the PCA and environmental  linear model results together into one dataframe 
lm_div_results <- 
  bind_rows(lm_divs_particle_comm, lm_divs_particle_percap,
          lm_divs_free_comm, lm_divs_free_percap,     
          lm_divs_all_comm, lm_divs_all_percap) %>% 
  dplyr::rename(independent_var = measure,
                dependent_var = dependent) 

See supplemental table 1 header for the table output!

Cross Validation

Cross validation will provide a better estimate for the adjusted R-squared value of the diversity regressions.

#################################### Community-Wide Production #################################### 
################### Richness ################### 
lm_prod_vs_rich_PA <- lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"))

summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.1446 -2.8561 -0.4608  3.7354  9.9395 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -11.167033   5.731339  -1.948  0.07996 . 
## mean          0.037941   0.009891   3.836  0.00329 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.499 on 10 degrees of freedom
## Multiple R-squared:  0.5954, Adjusted R-squared:  0.5549 
## F-statistic: 14.72 on 1 and 10 DF,  p-value: 0.003286
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.013 -11.090  -2.717   8.548  33.314 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  5.32029   21.71997   0.245    0.811
## mean         0.05540    0.06242   0.888    0.396
## 
## Residual standard error: 17.7 on 10 degrees of freedom
## Multiple R-squared:  0.07303,    Adjusted R-squared:  -0.01966 
## F-statistic: 0.7879 on 1 and 10 DF,  p-value: 0.3956
# Without the 2 high samples 
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle" & mean < 700)))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Particle" & mean < 700))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7760 -1.6847 -0.6523  2.8624  6.3591 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.43520    8.53971  -0.636    0.542
## mean         0.02559    0.01708   1.498    0.172
## 
## Residual standard error: 4.365 on 8 degrees of freedom
## Multiple R-squared:  0.2191, Adjusted R-squared:  0.1215 
## F-statistic: 2.244 on 1 and 8 DF,  p-value: 0.1725
rich_test_df <- 
  richness %>%
  mutate(frac_boolean = (fraction == "Particle")*1) 

# Correlation of richness between PA and FL
rich_test_df_sorted <- arrange(rich_test_df, norep_water_name)
rich_test_df_sorted_PA <- filter(rich_test_df_sorted, fraction == "Particle")
rich_test_df_sorted_FL <- filter(rich_test_df_sorted, fraction == "Free")
cor.test(rich_test_df_sorted_PA$mean, rich_test_df_sorted_FL$mean)
## 
##  Pearson's product-moment correlation
## 
## data:  rich_test_df_sorted_PA$mean and rich_test_df_sorted_FL$mean
## t = 0.75172, df = 10, p-value = 0.4695
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3950677  0.7108262
## sample estimates:
##       cor 
## 0.2312697
# Look into car::vif() for multicollinearity 
cor(rich_test_df$mean, rich_test_df$frac_boolean)
## [1] 0.6510939
summary(lm(frac_bacprod ~ mean + frac_boolean, data = rich_test_df))
## 
## Call:
## lm(formula = frac_bacprod ~ mean + frac_boolean, data = rich_test_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.449  -5.824  -1.118   5.647  34.871 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   10.00701    7.87588   1.271  0.21777   
## mean           0.04155    0.02055   2.021  0.05616 . 
## frac_boolean -23.18168    6.90002  -3.360  0.00297 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.83 on 21 degrees of freedom
## Multiple R-squared:  0.3506, Adjusted R-squared:  0.2887 
## F-statistic: 5.668 on 2 and 21 DF,  p-value: 0.01076
PA_rich_residplot <- plot_residuals(lm_prod_vs_rich_PA, filter(richness, fraction == "Particle")$frac_bacprod, "Community Production vs Particle-Associated Richness")
## [1] "There are no high leverage points in this model."

# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_rich_PA <- train(
      frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_rich_PA$results   # Particle Community-Wide CV results 
##   intercept     RMSE  Rsquared      MAE  RMSESD RsquaredSD    MAESD
## 1      TRUE 6.761303 0.6544213 5.382886 2.14011  0.2954728 1.736207
################### Shannon Entropy ################### 
lm_prod_vs_shannon_PA <- lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"))
cv_lm_prod_vs_shannon_PA <- train(
      frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_shannon_PA$results   # Particle Community-Wide CV results 
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 7.759682 0.7236966 5.788561 3.884461  0.2689809 2.736891
################### Inverse Simpson ################### 
lm_prod_vs_invsimps_PA <- lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_prod_vs_invsimps_PA)
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5101 -2.0966 -0.1789  0.8421  7.8801 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.42551    2.43457  -0.175 0.864742    
## mean         0.29241    0.05761   5.076 0.000481 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.572 on 10 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.6924 
## F-statistic: 25.76 on 1 and 10 DF,  p-value: 0.0004809
# Without the 2 high samples 
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70)))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction == 
##     "Particle" & mean < 70))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2949 -1.6336 -0.3733  1.4762  6.5363 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.6287     2.4758   0.658   0.5291  
## mean          0.2050     0.0806   2.543   0.0345 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.672 on 8 degrees of freedom
## Multiple R-squared:  0.4471, Adjusted R-squared:  0.378 
## F-statistic: 6.469 on 1 and 8 DF,  p-value: 0.03452
PA_invsimps_residplot <- plot_residuals(lm_prod_vs_invsimps_PA, filter(invsimps, fraction == "Particle")$frac_bacprod, "Community Production vs Particle-Associated Inverse Simpson")
## [1] "There are no high leverage points in this model."

invsimps_test_df <- 
  invsimps %>%
  mutate(frac_boolean = (fraction == "Particle")*1) 

cor(invsimps_test_df$mean, invsimps_test_df$frac_boolean)
## [1] 0.3171016
summary(lm(frac_bacprod ~ mean + frac_boolean, data = invsimps_test_df))
## 
## Call:
## lm(formula = frac_bacprod ~ mean + frac_boolean, data = invsimps_test_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.336  -5.979  -0.144   2.625  37.405 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   16.3727     5.1919   3.154  0.00479 **
## mean           0.3193     0.1521   2.099  0.04813 * 
## frac_boolean -17.7516     5.4875  -3.235  0.00397 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.75 on 21 degrees of freedom
## Multiple R-squared:  0.3587, Adjusted R-squared:  0.2976 
## F-statistic: 5.872 on 2 and 21 DF,  p-value: 0.009428
# Correlation of inverse Simpson's index between PA and FL
invsimps_test_df_sorted <- arrange(invsimps_test_df, norep_water_name)
invsimps_test_df_sorted_PA <- filter(invsimps_test_df_sorted, fraction == "Particle")
invsimps_test_df_sorted_FL <- filter(invsimps_test_df_sorted, fraction == "Free")
cor.test(invsimps_test_df_sorted_PA$mean, invsimps_test_df_sorted_FL$mean)
## 
##  Pearson's product-moment correlation
## 
## data:  invsimps_test_df_sorted_PA$mean and invsimps_test_df_sorted_FL$mean
## t = 1.7221, df = 10, p-value = 0.1158
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1318285  0.8255638
## sample estimates:
##       cor 
## 0.4782564
cv_lm_prod_vs_invsimps_PA <- train(
      frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_invsimps_PA$results       # Particle Community-Wide CV results
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD   MAESD
## 1      TRUE 5.153135 0.7560751 3.868983 2.056956  0.2676419 1.70888
################### Simpson's Evenness ################### 
lm_prod_vs_simpseven_PA <- lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"))
cv_lm_prod_vs_simpseven_PA <- train(
      frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_simpseven_PA$results   # Particle Community-Wide CV results 
##   intercept     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1      TRUE 6.210928 0.6561583 4.828706 3.029085  0.3026097 2.361936
#################################### Per-Capita Production #################################### 
################### Richness ################### 
lm_percell_prod_vs_rich_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm_percell_prod_vs_rich_PA)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, 
##     fraction == "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5548 -0.2142 -0.0078  0.1246  0.5942 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.055248   0.372497 -21.625 4.55e-09 ***
## mean         0.002368   0.000636   3.723  0.00475 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.352 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6064, Adjusted R-squared:  0.5626 
## F-statistic: 13.86 on 1 and 9 DF,  p-value: 0.004746
# Dummy variable regression
summary(lm(log10(fracprod_per_cell_noinf) ~ mean + frac_boolean, 
           data = rich_test_df))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean + frac_boolean, 
##     data = rich_test_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73738 -0.19725 -0.01448  0.24657  0.63488 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.3263959  0.2235397 -37.248  < 2e-16 ***
## mean          0.0022216  0.0005838   3.805  0.00111 ** 
## frac_boolean  0.3534155  0.1998678   1.768  0.09227 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3631 on 20 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6964, Adjusted R-squared:  0.6661 
## F-statistic: 22.94 on 2 and 20 DF,  p-value: 6.651e-06
PA_rich_percell_residplot <- plot_residuals(lm_percell_prod_vs_rich_PA, filter(richness, fraction == "Particle")$lm(log10(fracprod_per_cell_noinf)), "Log10(Cellular Production) vs Particle-Associated Richness")
## [1] "There are no high leverage points in this model."
## Error in xy.coords(x, y, xlabel, ylabel, log): attempt to apply non-function
cv_lm_percell_prod_vs_rich_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_rich_PA$results      # Particle Per-Capita CV results 
##   intercept      RMSE  Rsquared       MAE    RMSESD RsquaredSD    MAESD
## 1      TRUE 0.4135688 0.6353932 0.3319425 0.1430362  0.3182312 0.127428
################### Shannon Entropy ################### 
lm_percell_prod_vs_shannon_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle"))
cv_lm_percell_prod_vs_shannon_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_shannon_PA$results      # Particle Per-Capita CV results 
##   intercept      RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 0.4698687 0.7031069 0.350353 0.1911322  0.3283053 0.1416735
################### Inverse Simpson ################### 
lm_percell_prod_vs_invsimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle"))

summary(lm_percell_prod_vs_invsimps_PA)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28933 -0.18289 -0.11341  0.07465  0.56333 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.358363   0.159683  -46.08 5.34e-12 ***
## mean         0.018005   0.003758    4.79 0.000987 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2978 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7183, Adjusted R-squared:  0.687 
## F-statistic: 22.95 on 1 and 9 DF,  p-value: 0.0009868
# Dummy variable regression
summary(lm(log10(fracprod_per_cell_noinf) ~ mean + frac_boolean, data = invsimps_test_df))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean + frac_boolean, 
##     data = invsimps_test_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67981 -0.17066 -0.00109  0.13124  0.63544 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.01185    0.13640 -58.739  < 2e-16 ***
## mean          0.01815    0.00400   4.537 0.000201 ***
## frac_boolean  0.64856    0.14654   4.426 0.000260 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3347 on 20 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7421, Adjusted R-squared:  0.7163 
## F-statistic: 28.77 on 2 and 20 DF,  p-value: 1.304e-06
PA_invsimps_percell_residplot <- plot_residuals(lm_percell_prod_vs_invsimps_PA, filter(invsimps, fraction == "Particle")$lm(log10(fracprod_per_cell_noinf)), "Log10(Cellular Production) vs Particle-Associated Inverse Simpson")

## [1] "There are no high leverage points in this model."
## Error in xy.coords(x, y, xlabel, ylabel, log): attempt to apply non-function
cv_lm_percell_prod_vs_invsimps_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_invsimps_PA$results      # Particle Per-Capita CV results 
##   intercept      RMSE  Rsquared       MAE    RMSESD RsquaredSD      MAESD
## 1      TRUE 0.3627277 0.7384826 0.2953794 0.1152849  0.3120322 0.09661281
################### Simpson's Evenness ################### 
lm_percell_prod_vs_simpseven_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle"))
cv_lm_percell_prod_vs_simpseven_PA <- train(
      log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle" & norep_filter_name != "MOTEJ515"), 
      method ='lm', 
      trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100), 
      tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_simpseven_PA$results      # Particle Per-Capita CV results  
##   intercept      RMSE  Rsquared       MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 0.4361836 0.6603424 0.3503975 0.1419614  0.3057268 0.1133807

Diversity distributions

rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)

# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree

rich_vert_distribution_plot <- 
  ggplot(richness, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) + 
  ylab("Observed Richness") + xlab("Fraction") + 
  scale_shape_manual(values = season_shapes) +
  geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=980, size = 8, color = "#424645", 
           label= paste("***")) +
  annotate("text", x=1.5, y=880, size = 4, color = "#424645",
           label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 5))) +
  theme(legend.position = "none", axis.title.x = element_blank()) 
  



######################################################### EXPONENTIAL SHANNON ######################################################### 
exp_shannon_fraction_wilcox <- wilcox.test(mean ~ fraction, data = shannon)

# Make a data frame to draw significance line in boxplot (visually calculated)
exp_shannon_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(310,315,315,310)) # WholePart vs WholeFree

exp_shannon_vert_distribution_plot <- 
  ggplot(shannon, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,325), breaks = seq(from = 0, to =300, by = 50)) + 
  ylab("Exp(Shannon)") + xlab("Fraction") + 
  scale_shape_manual(values = season_shapes) +
  geom_path(data = exp_shannon_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=325, size = 8, color = "#424645", 
           label= paste("**")) +
  annotate("text", x=1.5, y=300, size = 4, color = "#424645",
           label= paste("p =", round(exp_shannon_fraction_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none", axis.title.x = element_blank()) 
  
######################################################### INVERSE SIMPSON ######################################################### 
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)

# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree

invsimps_vert_distribution_plot <- 
  ggplot(invsimps, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  geom_jitter(size = 3,  aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  ylab("Inverse Simpson") + xlab("Fraction") + 
  geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
  theme(legend.position = "none", axis.title.x = element_blank()) 


pdiv1 <- plot_grid(rich_vert_distribution_plot, exp_shannon_vert_distribution_plot, invsimps_vert_distribution_plot,
          ncol = 3, nrow = 1, labels = c("A", "B", "C"))

pdiv1_boxplots <- plot_grid(pdiv1, season_legend,
           ncol = 1, nrow = 2, 
           rel_heights = c(1, 0.05)); pdiv1_boxplots

pdiv2 <- plot_grid(rich_vert_distribution_plot + theme_minimal() + 
                                    theme(legend.position = "none", axis.title.x = element_blank()), 
                                  exp_shannon_vert_distribution_plot + theme_minimal() + 
                                    theme(legend.position = "none", axis.title.x = element_blank()), 
                                  invsimps_vert_distribution_plot + theme_minimal() + 
                                    theme(legend.position = "none", axis.title.x = element_blank()),
          ncol = 3, nrow = 1, labels = c("A", "B", "C"));

pdiv2_boxplots <- plot_grid(pdiv2, season_legend,
           ncol = 1, nrow = 2, 
           rel_heights = c(1, 0.05)); pdiv2_boxplots

Prepare Figure 2

######################################################### OBSERVED RICHNESS ######################################################### 
rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)
rich_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 129, p-value = 0.0004955
## alternative hypothesis: true location shift is not equal to 0
filter(richness) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean), min(mean), max(mean))
## # A tibble: 2 x 5
##   fraction `mean(mean)` `sd(mean)` `min(mean)` `max(mean)`
##   <fct>           <dbl>      <dbl>       <dbl>       <dbl>
## 1 Particle         557.      168.         344.        909.
## 2 Free             338.       85.5        239.        510.
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree

rich_distribution_plot <- 
  ggplot(richness, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) + 
  xlab("Observed Richness") + xlab("Fraction") + 
  scale_shape_manual(values = season_shapes) +
  geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=980, size = 8, color = "#424645", label= paste("***"), angle = 90) +
  annotate("text", x=1.5, y=790, size = 4, color = "#424645",
           label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 5))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()

################ Richness vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_rich_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Richness vs Community-wide (Per-Liter) Production 
prod_vs_rich_plot <-  
  ggplot(richness, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("\n Community Production \n (μgC/L/day)") + 
  xlab("Observed Richness") +
  geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) + 
   # Add the R2 and p-value to the plot 
  annotate("text", x=790, y=45, label=lm_lab_perliter_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 


# Summary stats  
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.1446 -2.8561 -0.4608  3.7354  9.9395 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -11.167033   5.731339  -1.948  0.07996 . 
## mean          0.037941   0.009891   3.836  0.00329 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.499 on 10 degrees of freedom
## Multiple R-squared:  0.5954, Adjusted R-squared:  0.5549 
## F-statistic: 14.72 on 1 and 10 DF,  p-value: 0.003286
# WITHOUT THE TWO HIGH POINTS 
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle" & mean < 800)))  
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction == 
##     "Particle" & mean < 800))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7760 -1.6847 -0.6523  2.8624  6.3591 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.43520    8.53971  -0.636    0.542
## mean         0.02559    0.01708   1.498    0.172
## 
## Residual standard error: 4.365 on 8 degrees of freedom
## Multiple R-squared:  0.2191, Adjusted R-squared:  0.1215 
## F-statistic: 2.244 on 1 and 8 DF,  p-value: 0.1725
################ Richness vs Per Capita (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_percell_rich_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_percell_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Richness vs Per Capita (Per-Cell) Production 
percell_prod_vs_rich_plot <- 
  ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5,  color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
  xlab("Observed Richness") +
  geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=790, y=-7.5, label=lm_lab_percell_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

# Summary stats 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, 
##     fraction == "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5548 -0.2142 -0.0078  0.1246  0.5942 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.055248   0.372497 -21.625 4.55e-09 ***
## mean         0.002368   0.000636   3.723  0.00475 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.352 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6064, Adjusted R-squared:  0.5626 
## F-statistic: 13.86 on 1 and 9 DF,  p-value: 0.004746
# WITHOUT THE TWO HIGH POINTS 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & mean < 800)))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, 
##     fraction == "Particle" & mean < 800))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39370 -0.18850 -0.00462  0.16877  0.42371 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.3565895  0.5415567 -13.584 2.76e-06 ***
## mean         0.0008726  0.0010848   0.804    0.448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2768 on 7 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08462,    Adjusted R-squared:  -0.04615 
## F-statistic: 0.6471 on 1 and 7 DF,  p-value: 0.4476
######################################################### SHANNON ENTROPY ######################################################### 
shannon_fraction_wilcox <- wilcox.test(mean ~ fraction, data = shannon)
shannon_fraction_wilcox   
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 119, p-value = 0.00556
## alternative hypothesis: true location shift is not equal to 0
filter(shannon) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
##   fraction `mean(mean)` `sd(mean)`
##   <fct>           <dbl>      <dbl>
## 1 Particle        109.        69.6
## 2 Free             55.9       17.0
# Make a data frame to draw significance line in boxplot (visually calculated)
shannon_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(310, 315, 315, 310)) # WholePart vs WholeFree

shannon_distribution_plot <- 
  ggplot(shannon, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(shape = season, fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,325), breaks = seq(from = 0, to = 300, by = 50)) + 
  xlab("Exp(Shannon)") +  xlab("Fraction") + 
  scale_shape_manual(values = season_shapes) +
    # Add line and pval
  geom_path(data = shannon_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=325, size = 8, color = "#424645", label= paste("**"), angle = 90) +
  annotate("text", x=1.5, y=270, size = 4, color = "#424645",
           label= paste("p =", round(shannon_fraction_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()

################ Shannon vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_shannon_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Shannon vs Community-wide (Per-Liter) Production 
prod_vs_shannon_plot <-  
  ggplot(shannon, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  ylab("\n Community Production \n (μgC/L/day)") + 
  scale_x_continuous(limits = c(0,325), breaks = seq(from = 0, to = 300, by = 50)) + 
  xlab("Exp(Shannon)") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=250, y=45, label=lm_lab_perliter_shannon_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 


################ Shannon vs Per Capitra (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_percell_shannon_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_percell_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Shannon vs Per Capitra (Per-Cell) Production 
percell_prod_vs_shannon_plot <- 
  ggplot(shannon, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
  xlab("Exp(Shannon)") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,325), breaks = seq(from = 0, to = 300, by = 50)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=250, y=-7.5, label=lm_lab_percell_shannon_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.title = element_blank(), legend.position = "none")


shannon_plots <- plot_grid(shannon_distribution_plot, prod_vs_shannon_plot, percell_prod_vs_shannon_plot,
          #labels = c("B", "D", "F"), 
          ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")


######################################################### INVERSE SIMPSON ######################################################### 
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)
invsimps_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 80, p-value = 0.6707
## alternative hypothesis: true location shift is not equal to 0
filter(invsimps) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
##   fraction `mean(mean)` `sd(mean)`
##   <fct>           <dbl>      <dbl>
## 1 Particle         35.5      23.9 
## 2 Free             24.1       8.11
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree

invsimps_distribution_plot <- 
  ggplot(invsimps, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  geom_jitter(size = 3,  aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  ylab("Inverse Simpson") + xlab("Fraction") + 
  geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
  theme(legend.position = "none", #axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


################ Inverse Simpson vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")

## 2. Plot Inverse Simpson vs Community-wide (Per-Liter) Production 
prod_vs_invsimps_plot <-  
  ggplot(invsimps, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("Community Production \n (μgC/L/day)") + 
  xlab("Inverse Simpson") +
  geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 

# Summary stats 
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5101 -2.0966 -0.1789  0.8421  7.8801 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.42551    2.43457  -0.175 0.864742    
## mean         0.29241    0.05761   5.076 0.000481 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.572 on 10 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.6924 
## F-statistic: 25.76 on 1 and 10 DF,  p-value: 0.0004809
# WITHOUT THE TWO HIGH POINTS 
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70)))
## 
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction == 
##     "Particle" & mean < 70))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2949 -1.6336 -0.3733  1.4762  6.5363 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.6287     2.4758   0.658   0.5291  
## mean          0.2050     0.0806   2.543   0.0345 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.672 on 8 degrees of freedom
## Multiple R-squared:  0.4471, Adjusted R-squared:  0.378 
## F-statistic: 6.469 on 1 and 8 DF,  p-value: 0.03452
################ Inverse Simpson vs Per Capitra (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_percell_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_percell_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")

## 2. Plot Inverse Simpson vs Per Capitra (Per-Cell) Production 
percell_prod_vs_invsimps_plot <- 
  ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf), fill = fraction, color = fraction)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, color = "black", aes(shape = season)) +
  scale_color_manual(values = fraction_colors, guide = TRUE) +
  scale_fill_manual(values = fraction_colors, guide = FALSE) +
  scale_shape_manual(values = season_shapes) +
  ylab("log10(production/cell)\n (μgC/cell/day)") +
  xlab("Inverse Simpson") +
  geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=70, y=-7.5, label=lm_lab_percell_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  guides(color = guide_legend(override.aes = list(shape = 15))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

# Summary stats 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28933 -0.18289 -0.11341  0.07465  0.56333 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.358363   0.159683  -46.08 5.34e-12 ***
## mean         0.018005   0.003758    4.79 0.000987 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2978 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7183, Adjusted R-squared:  0.687 
## F-statistic: 22.95 on 1 and 9 DF,  p-value: 0.0009868
# WITHOUT THE TWO HIGH POINTS 
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70)))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, 
##     fraction == "Particle" & mean < 70))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23904 -0.19052 -0.03487  0.06990  0.49055 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.172409   0.164716 -43.544  8.8e-10 ***
## mean         0.009523   0.005573   1.709    0.131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.243 on 7 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2944, Adjusted R-squared:  0.1936 
## F-statistic:  2.92 on 1 and 7 DF,  p-value: 0.1312
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70 & log10(fracprod_per_cell_noinf) < 1e-7)))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, 
##     fraction == "Particle" & mean < 70 & log10(fracprod_per_cell_noinf) < 
##         1e-07))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23904 -0.19052 -0.03487  0.06990  0.49055 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.172409   0.164716 -43.544  8.8e-10 ***
## mean         0.009523   0.005573   1.709    0.131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.243 on 7 degrees of freedom
## Multiple R-squared:  0.2944, Adjusted R-squared:  0.1936 
## F-statistic:  2.92 on 1 and 7 DF,  p-value: 0.1312
# Plot Inverse simpson plots altogether
invsimps_plots <- plot_grid(invsimps_distribution_plot, prod_vs_invsimps_plot, percell_prod_vs_invsimps_plot,
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")

Figure 2

# All plots together 
rich_plots <- plot_grid(rich_distribution_plot, prod_vs_rich_plot, 
                        percell_prod_vs_rich_plot + theme(legend.position = "none"),
          labels = c("A", "D", "G"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1),
          align = "v")


shannon_plots_noyaxis <- plot_grid(shannon_distribution_plot + 
                                     theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
                                           #  specify top, right, bottom, and left margins
                                           plot.margin = margin(0, 0, 0, 0.6, "cm")), 
          prod_vs_shannon_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()), 
          percell_prod_vs_shannon_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank(), legend.position = "none"),
          labels = c("B", "E", "H"), 
          ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1),
          align = "v")

invsimps_plots_noyaxis <- 
  plot_grid(invsimps_distribution_plot + 
                                     theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
                                           #  specify top, right, bottom, and left margins
                                           plot.margin = margin(0, 0, 0, 0.6, "cm")), 
            prod_vs_invsimps_plot + theme(axis.title.y = element_blank(),  axis.text.y = element_blank()), 
            percell_prod_vs_invsimps_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank(), legend.position = "none"), 
          labels = c("C", "F", "I"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1),
          align = "v")

# Extract distribution plots of richness and inverse simpson
figure2_row1 <- plot_grid(rich_plots, shannon_plots_noyaxis, invsimps_plots_noyaxis,
          ncol = 3, nrow = 1, rel_widths = c(1, 0.75, 0.75),
          align = "h")


######## PLOT FIGURE 2
plot_grid(figure2_row1, season_legend,
                   ncol = 1, nrow = 2, 
                   rel_heights = c(1, 0.05))

Figure 3

Unweighted ses.mpd

######################################################### SES MPD DISTRIBUTION: UNWEIGHTED  



################ Figure 3A: Distribution of Particle and Free Unweighted Mean Pairwise distance
## 1. Is there a difference between particle and free Unweighted Mean Pairwise distance?
unweighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = unweighted_df)
unweighted_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mpd.obs.z by fraction
## W = 31, p-value = 0.01727
## alternative hypothesis: true location shift is not equal to 0
# 2. What are the means of particle and free unweighted PD?
unweighted_df %>%
  group_by(fraction) %>%
  summarize(mean(mpd.obs.z))
## # A tibble: 2 x 2
##   fraction `mean(mpd.obs.z)`
##   <fct>                <dbl>
## 1 Particle            -0.488
## 2 Free                 0.432
## 3. Plot Unweighted Phylogenetic Diversity vs Richness  
# Make a data frame to draw significance line in boxplot (visually calculated)
dat4 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree

unweight_distribution_plot <- 
  ggplot(unweighted_df, aes(y = mpd.obs.z, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_shape_manual(values = season_shapes) + 
  scale_y_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  xlab("Fraction") + 
  geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
  ##### Particle vs free per-cell production 
  geom_path(data = dat4, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=1.85, size = 8, color = "#424645", label= paste("*"), angle = 90) +
  annotate("text", x=1.5, y=1.35, size = 4, color = "#424645",
           label= paste("p =", round(unweighted_fraction_wilcox$p.value, digits = 2))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


################ Figure 3B: Unweighted Phylogenetic Diversity vs Richness  
## 1. Is there a relationship between richness and Unweighted Mean Pairwise distance?
lm_richness_vs_unweight <- lm(mean ~ mpd.obs.z, data = unweighted_df)

  # Linear model results for particle-associated only 
  summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -189.25 -112.84  -41.51   58.66  277.56 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   515.93      50.46  10.224  1.3e-06 ***
## mpd.obs.z     -83.81      48.58  -1.725    0.115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 154.4 on 10 degrees of freedom
## Multiple R-squared:  0.2294, Adjusted R-squared:  0.1523 
## F-statistic: 2.977 on 1 and 10 DF,  p-value: 0.1152
  # Linear model results for free-living only 
  summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == 
##     "Free"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98.56 -64.51 -19.87  45.52 173.60 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   332.73      42.64   7.803 1.46e-05 ***
## mpd.obs.z      12.70      78.55   0.162    0.875    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 89.57 on 10 degrees of freedom
## Multiple R-squared:  0.002607,   Adjusted R-squared:  -0.09713 
## F-statistic: 0.02614 on 1 and 10 DF,  p-value: 0.8748
## 2. Extract the R2 and p-value from the linear model: 
lm_lab_richness_vs_unweight<- paste("atop(R^2 ==", round(summary(lm_richness_vs_unweight)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_richness_vs_unweight)$coefficients[,4][2]), digits = 3), ")")

## 3. Plot Unweighted Phylogenetic Diversity vs Richness  
unweight_rich_vs_mpd_plot <- 
  ggplot(unweighted_df, aes(y = mean, x = mpd.obs.z)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, aes(fill = fraction, shape = season)) + ylab("\n Observed  Richness") +
  scale_shape_manual(values = season_shapes) + 
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) + 
  scale_x_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=0.75, y=750, label=lm_lab_richness_vs_unweight, parse = TRUE, color = "#424645", size = 4) +
  theme(legend.title = element_blank(), legend.position = "none", 
        axis.text.x = element_blank(), axis.title.x = element_blank())


################ Figure 3C: Unweighted Mean Pairwise distance vs Community-Wide Production
# Is there a relationship between Production and Unweighted Mean Pairwise distance?
summary(lm(frac_bacprod ~ mpd.obs.z, data = unweighted_df)) # NS 
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = unweighted_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.251  -9.636  -3.128   8.298  44.287 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.110      3.111   5.499 1.59e-05 ***
## mpd.obs.z      3.621      3.754   0.965    0.345    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.23 on 22 degrees of freedom
## Multiple R-squared:  0.04058,    Adjusted R-squared:  -0.003031 
## F-statistic: 0.9305 on 1 and 22 DF,  p-value: 0.3452
  # Linear model results for particle-associated only 
  summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Particle"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.340 -4.276 -1.717  3.441 19.484 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    8.288      2.593   3.196  0.00955 **
## mpd.obs.z     -3.425      2.496  -1.372  0.19998   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.931 on 10 degrees of freedom
## Multiple R-squared:  0.1585, Adjusted R-squared:  0.07431 
## F-statistic: 1.883 on 1 and 10 DF,  p-value: 0.2
  # Linear model results for free-living only 
  summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.343 -14.675   2.146   8.551  37.309 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   18.604      8.481   2.194    0.053 .
## mpd.obs.z     12.634     15.621   0.809    0.437  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.81 on 10 degrees of freedom
## Multiple R-squared:  0.06139,    Adjusted R-squared:  -0.03247 
## F-statistic: 0.6541 on 1 and 10 DF,  p-value: 0.4375
unweight_prod_vs_mpd_plot <- 
  ggplot(unweighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3, aes(shape = season)) +
  scale_shape_manual(values = season_shapes) + 
  ylab("\n Community Production \n (μgC/L/day)") +
  xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  theme(legend.title = element_blank(), legend.position = "none",
        axis.text.x = element_blank(), axis.title.x = element_blank())



################ Figure 3D: Unweighted Phylogenetic Diversity vs Per Capita (Per-Cell) Production 
## 1. Run the linear model: Is there a relationship between PER-CELL PRODUCTION and Unweighted Mean Pairwise distance?
unweight_vs_percell <- lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
summary(unweight_vs_percell)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7079 -0.2792  0.0096  0.1331  1.3149 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.19865    0.09971 -72.198  < 2e-16 ***
## mpd.obs.z   -0.49577    0.11961  -4.145  0.00046 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4769 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:   0.45,  Adjusted R-squared:  0.4238 
## F-statistic: 17.18 on 1 and 21 DF,  p-value: 0.0004595
summary(aov(log10(fracprod_per_cell_noinf) ~ mpd.obs.z +fraction, data = unweighted_df))
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## mpd.obs.z    1  3.908   3.908   21.08 0.000177 ***
## fraction     1  1.070   1.070    5.77 0.026131 *  
## Residuals   20  3.707   0.185                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
summary(aov(log10(fracprod_per_cell_noinf) ~ fraction + mpd.obs.z, data = unweighted_df))
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## fraction     1  4.139   4.139  22.330 0.00013 ***
## mpd.obs.z    1  0.838   0.838   4.523 0.04608 *  
## Residuals   20  3.707   0.185                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
lmer(log10(fracprod_per_cell_noinf) ~ mpd.obs.z + (fraction) +
     (0+dummy(fraction, "Particle")), data = Orthodont)
## Error in eval(predvars, data, env): object 'fracprod_per_cell_noinf' not found
library(lme4)
# Look for the marginal R2 
null<-lmer(log10(fracprod_per_cell_noinf)~(1|fraction),data=unweighted_df)
full<-lmer(log10(fracprod_per_cell_noinf)~mpd.obs.z+(1| fraction),data=unweighted_df)
anova(null,full)
## Data: unweighted_df
## Models:
## null: log10(fracprod_per_cell_noinf) ~ (1 | fraction)
## full: log10(fracprod_per_cell_noinf) ~ mpd.obs.z + (1 | fraction)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
## null  3 40.590 43.997 -17.295   34.590                           
## full  4 36.268 40.810 -14.134   28.268 6.3225      1    0.01192 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(full)
## Analysis of Variance Table
##           Df Sum Sq Mean Sq F value
## mpd.obs.z  1  1.128   1.128  6.0853
summary(lm(log10(fracprod_per_cell_noinf) ~ fraction, data = unweighted_df))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ fraction, data = unweighted_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64696 -0.35864  0.04763  0.17090  1.25132 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.7258     0.1403 -47.946  < 2e-16 ***
## fractionFree  -0.8492     0.1942  -4.373 0.000266 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4652 on 21 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4766, Adjusted R-squared:  0.4517 
## F-statistic: 19.12 on 1 and 21 DF,  p-value: 0.0002664
  # Linear model results for particle-associated only 
  summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37599 -0.24344 -0.16715  0.05839  1.17655 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.9205     0.1684 -41.096 1.49e-11 ***
## mpd.obs.z    -0.3264     0.1583  -2.062   0.0693 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4624 on 9 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3208, Adjusted R-squared:  0.2453 
## F-statistic: 4.251 on 1 and 9 DF,  p-value: 0.06928
  # Linear model results for free-living only 
  summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, 
##     fraction == "Free"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63338 -0.30053  0.08886  0.16667  0.76862 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.54640    0.19666 -38.372 3.45e-12 ***
## mpd.obs.z   -0.06636    0.36224  -0.183    0.858    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4131 on 10 degrees of freedom
## Multiple R-squared:  0.003345,   Adjusted R-squared:  -0.09632 
## F-statistic: 0.03356 on 1 and 10 DF,  p-value: 0.8583
## 2. Extract the R2 and p-value from the linear model: 
lm_lab_percell_unweightPD <- paste("atop(R^2 ==", round(summary(unweight_vs_percell)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(unweight_vs_percell)$coefficients[,4][2]), digits = 4), ")")

## 3. Plot Unweighted Phylogenetic Diversity vs Per Capitra (Per-Cell) Production 
unweight_percell_vs_mpd_plot <- 
  ggplot(unweighted_df, 
       aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3,  aes(fill = fraction, shape = season)) + 
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
  xlab("Unweighted Phylogenetic Diversity") +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) + 
  geom_smooth(method = "lm", color="#424645", fill = "#424645", alpha = 0.3) +
  scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) + 
  scale_x_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=0.75, y=-6, label=lm_lab_percell_unweightPD, parse = TRUE, color = "#424645", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))





######## FIGURE 3
figure3_row1 <- plot_grid(unweight_distribution_plot, unweight_rich_vs_mpd_plot, unweight_prod_vs_mpd_plot, 
          unweight_percell_vs_mpd_plot + theme(legend.position = "none"), 
          labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
          rel_heights = c(0.5, 1, 1, 1.2),
          align = "v")

plot_grid(figure3_row1, season_legend,
                   ncol = 1, nrow = 2, 
                   rel_heights = c(1, 0.05))

MPD + Shannon + Inverse Simpson

# EXPONENTIAL SHANNON
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_shannon_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Exp Shannon vs Community-wide (Per-Liter) Production 
ggplot(shannon, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("Community Production \n (μgC/L/day)") + 
  xlab("Exponential Shannon") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,300), breaks = seq(from = 0, to = 300, by = 50)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=250, y=45, label=lm_lab_perliter_shannon_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none") 

# INVERSE SIMPSON 
ggplot(invsimps, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("Community Production \n (μgC/L/day)") + 
  xlab("Inverse Simpson") +
  geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none") 

Lasso Regressions

Prepare the data for Lasso Regressions

unweight_vals <- unweighted_df %>%  
  dplyr::select(mpd.obs.z, norep_filter_name) %>%
  rename(Unweighted_PD = mpd.obs.z)
   
weighted_vals <- weighted_df %>%
  dplyr::select(mpd.obs.z, norep_filter_name) %>%
  rename(Weighted_PD = mpd.obs.z)


wide_all_divs <- otu_alphadiv %>%
  dplyr::select(norep_filter_name ,mean, measure) %>%
  spread(measure, mean)

lasso_data_df <- metadata_pca %>%
  left_join(wide_all_divs, by = "norep_filter_name") %>%
  left_join(unweight_vals, by = "norep_filter_name") %>%
  left_join(weighted_vals, by = "norep_filter_name") %>%
  dplyr::select(-c(project, year, Date, limnion, norep_water_name, dnaconcrep1, perc_attached_bacprod,
                   SD_perc_attached_bacprod, SE_total_bac_abund, SE_perc_attached_abund, SE_attached_bac))
## Warning: Column `norep_filter_name` joining character vector and factor, coercing into character vector
### PARTICLE DATA 
lasso_data_df_particle <- lasso_data_df %>%
  filter(fraction == "Particle" ) %>%
  dplyr::select(-fraction)

lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
  dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, 
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   norep_filter_name, BGA_cellspermL, Turb_NTU,
                   lakesite, season, station)) 
  
percell_lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod,
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
  dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )



### FREE DATA 
lasso_data_df_free <- lasso_data_df %>%
  filter(fraction == "Free" ) %>%
  dplyr::select(-fraction)

lasso_data_df_free_noprod <- lasso_data_df_free %>%
  dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, 
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   norep_filter_name, BGA_cellspermL, Turb_NTU,
                   lakesite, season, station)) 
  
percell_lasso_data_df_free_noprod <- lasso_data_df_free %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod,
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
  dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )



## ALL DATA 
all_dat_lasso_comm <- lasso_data_df %>%
    dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, 
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, 
                   norep_filter_name, BGA_cellspermL, Turb_NTU,
                   lakesite, season, station, fraction)) 
  

all_dat_lasso_percapita <- lasso_data_df  %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod,
                   tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, fraction,
                   lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
  dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )

Perform Lasso regression

Bulk Community Production: Particle

# Set the seed for reproducibility of the grid values 
set.seed(777) 

################ PREPARE DATA ################ 
# Subset data needed and scale all o
scaled_comm_data <- 
  lasso_data_df_particle_noprod %>%    # Use data only for the particle samples  
  scale() %>%                          # Scale all of the variables to have mean =0 and sd = 1
  as.data.frame()                      # Make it a dataframe so that model.matrix function works (does not take a matrix)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., scaled_comm_data)[,-1]
y = scaled_comm_data$frac_bacprod
grid = 10^seq(10,-2,length = 100)

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################ 
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]

################ LASSO ################ 
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min # Minimum lambda
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 0.9944787
## Run lasso on the entire dataset with the best lambda value 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -7.929167e-17
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                   .           
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL           -7.450631e-03
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund     -1.291309e-01
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      5.883226e-01
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD         -4.023568e-02
## Now, run the most simple, parsimonious lasso model within 1 standard error  
## IMPORTANT NOTE: This is the lasso reported within the manuscript
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -4.556091e-17
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                   .           
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      3.261442e-01
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .

The lasso model uses Inverse Simpson as the best and only predictor of particle-associated production!

Community-Wide Production: Free

# Set the seed for reproducibility of the grid values 
set.seed(777) 
     
################ PREPARE DATA ################ 
# Subset data needed and scale all o
scaled_comm_data_free <- 
  lasso_data_df_free_noprod %>%        # Use data only for the free-living samples  
  scale() %>%                          # Scale all of the variables to have mean = 0 and sd = 1
  as.data.frame()                      # Make it a dataframe so that model.matrix function works (does not take a matrix)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_free)[,-1]
y <- scaled_comm_data_free$frac_bacprod
grid <- 10^seq(10,-2,length = 100)

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################ 
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]

################ LASSO ################ 
# Run lasso regression with alpha = 1
lasso_divs_train_free_comm <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train_free_comm)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train_free_comm, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 1.491743
## Run lasso on the entire dataset with the best lambda value 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -5.076392e-16
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                  -2.606707e-01
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      .           
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .
## Now, run the most simple, parsimonious lasso model within 1 standard error  
## IMPORTANT NOTE: This is the lasso reported within the manuscript
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -4.127014e-16
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                  -2.144478e-01
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      .           
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .

The lasso model uses pH as the best and only predictor of free-living production!

Per-capita Production: Particle

scaled_percapita_data <- percell_lasso_data_df_particle_noprod %>%   
  dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
  mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
  dplyr::select(-c(fracprod_per_cell_noinf)) %>%
  scale() %>%
  as.data.frame()
   
set.seed(777)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data)[,-1]
y = scaled_percapita_data$log10_percell
grid = 10^seq(10,-2,length = 100)

# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]

################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2)) 
plot(lasso_divs_train)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 1.070695
## Run lasso on the entire dataset with the best lasso 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -6.969802e-16
## Sample_depth_m       .           
## Temp_C              -1.265111e-01
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                   .           
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      3.877871e-01
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .
## Now, run the most simple, parsimonious lasso model within 1 standard error  
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -6.909360e-16
## Sample_depth_m       .           
## Temp_C              -9.140559e-02
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                   .           
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      3.526807e-01
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .

Temperature and Inverse Simpson’s Index are the two best predictors of particle-associated per-capita production.

Per-capita Production: Free

scaled_percapita_data_free <- percell_lasso_data_df_free_noprod %>%    
  dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
  mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
  dplyr::select(-c(fracprod_per_cell_noinf)) %>%
  scale() %>%
  as.data.frame()

set.seed(777)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data_free)[,-1]
y = scaled_percapita_data_free$log10_percell
grid = 10^seq(10,-2,length = 100)

# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]

################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] TRUE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 1.409333
## Run lasso on the entire dataset 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)          2.872469e-15
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                  -3.752725e-01
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      .           
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .
## Now, run the most simple, parsimonious lasso model within 1 standard error  
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se) 
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)          2.872469e-15
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                  -3.752725e-01
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## attached_bac         .           
## perc_attached_abund  .           
## fraction_bac_abund   .           
## PC1                  .           
## PC2                  .           
## Richness             .           
## Exponential_Shannon  .           
## Inverse_Simpson      .           
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .

pH is the only predictors of free-living per-capita production.

Community-wide Production: All Samples

# Set the seed for reproducibility of the grid values 
set.seed(777)      
   
################ PREPARE DATA ################ 
# Subset data needed and scale all o
scaled_comm_data_ALL <- 
  all_dat_lasso_comm %>%               # Use community-wide production data only for the all samples  
  scale() %>%                          # Scale all of the variables to have mean =0 and sd = 1
  as.data.frame()                     # Make it a dataframe so that model.matrix function works (does not take a matrix)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_ALL)[,-1]
y <- scaled_comm_data_ALL$frac_bacprod
grid <- 10^seq(10,-2,length = 100)

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################ 
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]


################ LASSO ################ 
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
best_lasso_lambda
## [1] 0.8130063
lasso_lambda_1se
## [1] 0.8517182
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 0.5694425
## Run lasso on the entire dataset with the best lambda value 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)         2.081668e-17
## Sample_depth_m      0.000000e+00
## Temp_C              .           
## SpCond_uSpercm      .           
## TDS_mgperL          .           
## pH                  .           
## ORP_mV              .           
## Chl_Lab_ugperL      .           
## Cl_mgperL           .           
## SO4_mgperL          .           
## NO3_mgperL          .           
## NH3_mgperL          .           
## TKN_mgperL          .           
## SRP_ugperL          .           
## TP_ugperL           .           
## Alk_mgperL          .           
## DO_mgperL           .           
## DO_percent          .           
## total_bac_abund     .           
## attached_bac        .           
## perc_attached_abund .           
## fraction_bac_abund  .           
## PC1                 .           
## PC2                 .           
## Richness            .           
## Exponential_Shannon .           
## Inverse_Simpson     .           
## Simpsons_Evenness   .           
## Unweighted_PD       .           
## Weighted_PD         .
## Run lasso on the entire dataset with the best lambda value 
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                                1
## (Intercept)         2.081668e-17
## Sample_depth_m      0.000000e+00
## Temp_C              .           
## SpCond_uSpercm      .           
## TDS_mgperL          .           
## pH                  .           
## ORP_mV              .           
## Chl_Lab_ugperL      .           
## Cl_mgperL           .           
## SO4_mgperL          .           
## NO3_mgperL          .           
## NH3_mgperL          .           
## TKN_mgperL          .           
## SRP_ugperL          .           
## TP_ugperL           .           
## Alk_mgperL          .           
## DO_mgperL           .           
## DO_percent          .           
## total_bac_abund     .           
## attached_bac        .           
## perc_attached_abund .           
## fraction_bac_abund  .           
## PC1                 .           
## PC2                 .           
## Richness            .           
## Exponential_Shannon .           
## Inverse_Simpson     .           
## Simpsons_Evenness   .           
## Unweighted_PD       .           
## Weighted_PD         .

As we can see in Figure 2 and Table S1, there is much less variation explained in this model by a linear regression. Here, the result of the “one-standard-error” approach shows that the lasso is unable to pick a variable. Therefore, no variables are chosen.

Per-capita Production: All Samples

set.seed(777) 
      
scaled_percapita_ALL <- all_dat_lasso_percapita %>%
  dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
  mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
  dplyr::select(-c(fracprod_per_cell_noinf, perc_attached_abund, fraction_bac_abund, attached_bac)) %>%
  scale() %>%
  as.data.frame()


# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_ALL)[,-1]
y = scaled_percapita_ALL$log10_percell
grid = 10^seq(10,-2,length = 100)

# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train

################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique 'x' values
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## Warning in lasso_divs_pred - y_test: longer object length is not a multiple of shorter object length
## Error in mean((lasso_divs_pred - y_test)^2): dims [product 11] do not match the length of object [12]
## Run lasso on the entire dataset 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 27 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)          6.879745e-16
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                  -1.291397e-01
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## PC1                  .           
## PC2                  .           
## Richness             4.458786e-01
## Exponential_Shannon  .           
## Inverse_Simpson      .           
## Simpsons_Evenness    .           
## Unweighted_PD       -1.172394e-01
## Weighted_PD          .
## Run lasso on the entire dataset 
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 27 x 1 sparse Matrix of class "dgCMatrix"
##                                 1
## (Intercept)         -3.170199e-17
## Sample_depth_m       .           
## Temp_C               .           
## SpCond_uSpercm       .           
## TDS_mgperL           .           
## pH                   .           
## ORP_mV               .           
## Chl_Lab_ugperL       .           
## Cl_mgperL            .           
## SO4_mgperL           .           
## NO3_mgperL           .           
## NH3_mgperL           .           
## TKN_mgperL           .           
## SRP_ugperL           .           
## TP_ugperL            .           
## Alk_mgperL           .           
## DO_mgperL            .           
## DO_percent           .           
## total_bac_abund      .           
## PC1                  .           
## PC2                  .           
## Richness             3.263384e-01
## Exponential_Shannon  .           
## Inverse_Simpson      .           
## Simpsons_Evenness    .           
## Unweighted_PD        .           
## Weighted_PD          .

As we can see in Figure S7 and Table S2, Richness is the best predictor of per-capita heterotrophic production when all samples are used in the model.

Supplemental Figures

Figure S1

Map of Muskegon Lake

# Load the ggmap package for plotting of Muskegon Lake
library(ggmap)

# Extract the coordinates for Muskegon Lake 
lakesite_coordinates <- read.csv("data/metadata/ML_GPS_Coordinates.csv") %>%
  dplyr::select(lakesite, Latitude, Longitude) %>%
  dplyr::rename(Lat = Latitude, Long = Longitude)

# Set the boundaries for the map 
map <- get_map(c(-86.35, 43.21, -86.235, 43.265),  #left/bottom/right/top
                 zoom = 13, maptype = "toner-background")

# Plot the Muskegon Lake Map 
ggmap(map) + xlab("Longitude") + ylab("Latitude") +
  theme(axis.text = element_text(size = 10, color = "black"),
        axis.title = element_text(size = 12, color = "black", face = "bold")) +
    geom_point(data = lakesite_coordinates, 
               aes(x=Long, y=Lat), color="red", size=4) +
    geom_text(data = lakesite_coordinates,
              aes(x=Long, y=Lat, label=lakesite), col="white", cex=4, vjust = 0.5,hjust = -0.25) 

Figure S2

Faith’s PD

surface_PAFL_otu_pruned_RAREFIED_rm2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1891 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 70 sample variables ]
## tax_table()   Taxonomy Table:    [ 1891 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1891 tips and 1889 internal nodes ]
## Calculate Faith's PD and species richness for sample 
faiths_pd_RAREFIED <- pd(comm_RAREFIED_rm2, phy_RAREFIED_rm2, include.root = FALSE)
faiths_pd_RAREFIED$norep_filter_name <- row.names(faiths_pd_RAREFIED)


# Join Faith's PD with the rest of the metadata 
meta_data_PD <- left_join(faiths_pd_RAREFIED, lasso_data_df, by = "norep_filter_name")

################ Figure S1: species richness and faith's PD
## 1. Run the linear model: Is there a correlation between species richness and faith's PD?
lm_PD_vs_SR <- lm(PD ~ SR, data = meta_data_PD)
summary(lm_PD_vs_SR)
## 
## Call:
## lm(formula = PD ~ SR, data = meta_data_PD)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.93991 -0.96092  0.03063  1.00013  2.60206 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.836131   0.986599   10.98 2.13e-10 ***
## SR           0.062600   0.002285   27.40  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.52 on 22 degrees of freedom
## Multiple R-squared:  0.9715, Adjusted R-squared:  0.9702 
## F-statistic: 750.9 on 1 and 22 DF,  p-value: < 2.2e-16
## 2. Extract the R2 and p-value from the linear model: 
lm_lab_PD_vs_SR<- paste("atop(R^2 ==", round(summary(lm_PD_vs_SR)$adj.r.squared, digits = 3), ",",
             "p ==", round(unname(summary(lm_PD_vs_SR)$coefficients[,4][2]), digits = 20), ")")

## 3. Plot Figure S1: species richness and faith's PD
ggplot(meta_data_PD, aes(y = PD, x = SR)) + 
  geom_point(size = 3) + 
  ylab("Faith's PD") + 
  xlab("Species Richness") +
  geom_smooth(method = "lm", color = "black") +
  # Add the R2 and p-value to the plot 
  annotate("text", x=600, y=35, label=lm_lab_PD_vs_SR, parse = TRUE, color = "black", size = 4) 

Figure S3

Correlations between Particle and Free of values in Figure 1

work_df <- metadata %>%
  dplyr::select(norep_filter_name, fraction, fraction_bac_abund, frac_bacprod, fracprod_per_cell_noinf) %>%
  mutate(norep_water_name = paste(substr(norep_filter_name, 1,4), substr(norep_filter_name, 6,9), sep = "")) %>%
  dplyr::select(-norep_filter_name)

part_work_df <- work_df %>%
  filter(fraction == "Particle") %>%
  rename(part_bacabund = fraction_bac_abund,
         part_prod = frac_bacprod, 
         part_percell_prod = fracprod_per_cell_noinf) %>%
  dplyr::select(-fraction)

free_work_df <- work_df %>%
  filter(fraction == "Free") %>%
  rename(free_bacabund = fraction_bac_abund,
         free_prod = frac_bacprod, 
         free_percell_prod = fracprod_per_cell_noinf) %>%
  dplyr::select(-fraction)

byfrac_df <- part_work_df %>%
  left_join(free_work_df, by = "norep_water_name")

byfrac_df$season <- substr(byfrac_df$norep_water_name, 5,5) # 7th letter = month sampled
byfrac_df$season <- as.character(byfrac_df$season)
byfrac_df$season <- ifelse(byfrac_df$season == "5", "Spring", 
                             ifelse(byfrac_df$season == "7", "Summer", 
                                    ifelse(byfrac_df$season == "9", "Fall",
                                           "NA")))
byfrac_df$season <- factor(byfrac_df$season, levels = c("Spring", "Summer", "Fall"))

summary(lm(log10(part_bacabund) ~ log10(free_bacabund), data = filter(byfrac_df, norep_water_name != "MOTE515")))
## 
## Call:
## lm(formula = log10(part_bacabund) ~ log10(free_bacabund), data = filter(byfrac_df, 
##     norep_water_name != "MOTE515"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52690 -0.08048  0.05903  0.19565  0.35255 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)            2.0900     3.2247   0.648    0.533
## log10(free_bacabund)   0.4264     0.5532   0.771    0.461
## 
## Residual standard error: 0.3111 on 9 degrees of freedom
## Multiple R-squared:  0.06194,    Adjusted R-squared:  -0.04229 
## F-statistic: 0.5943 on 1 and 9 DF,  p-value: 0.4605
plot1 <- ggplot(filter(byfrac_df, norep_water_name != "MOTE515"), 
       aes(x = log10(free_bacabund), y = log10(part_bacabund))) +
  xlab("Free") +  ylab("Particle") + 
  ggtitle("Log10(Bacterial Counts) \n (cells/mL)") +
  geom_point(size = 3, fill = "grey", aes(shape = season), width = 0.2) + 
  scale_shape_manual(values = season_shapes) +
  theme(legend.position = "bottom",
        legend.title = element_blank())




################ Community-Wide production correlation between particle and free
## 1. Run the linear regression 
lm_prod_corr <- lm(part_prod ~ free_prod, data = byfrac_df)

# 2. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_PAvsFL <- paste("atop(R^2 ==", round(summary(lm_prod_corr)$adj.r.squared, digits = 3), ",",
            "p ==", round(unname(summary(lm_prod_corr)$coefficients[,4][2]), digits = 3), ")")

## 3. Plot Community production correlation between particle and free
plot2 <- ggplot(byfrac_df, aes(x = free_prod, y = part_prod)) +
  xlab("Free") +  ylab("Particle") + 
  ggtitle("Community Production \n(μgC/L/day)") +
  geom_point(size = 3, fill = "grey", aes(shape = season), width = 0.2) + 
  scale_shape_manual(values = season_shapes) +
  geom_smooth(method = "lm", color = "black")  + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=20, y=30, label=lm_lab_perliter_PAvsFL, parse = TRUE, color = "black", size = 4) +
  theme(legend.position = "none")


################ Per-Capita production correlation between particle and free
## 1. Run the linear regression 
lm_percell_corr <- lm(log10(part_percell_prod) ~ log10(free_percell_prod), data = byfrac_df)

# 2. Extract the R2 and p-value from the linear model: 
lm_lab_percell_PAvsFL <- paste("atop(R^2 ==", round(summary(lm_percell_corr)$adj.r.squared, digits = 3), ",",
            "p ==", round(unname(summary(lm_percell_corr)$coefficients[,4][2]), digits = 3), ")")

## 3. Plot Per-Capita production correlation between particle and free
plot3 <- ggplot(byfrac_df,
       aes(x = log10(free_percell_prod), y = log10(part_percell_prod))) +  
  xlab("Free") +  ylab("Particle") + 
  ggtitle("log10(Per-Capita Production)\n (μgC/cell/day)") +
  geom_point(size = 3, fill = "grey", aes(shape = season), width = 0.2) + 
  scale_shape_manual(values = season_shapes) +
  geom_smooth(method = "lm", color = "black") + 
  # Add the R2 and p-value to the plot 
  annotate("text", y = -5.8, x=-7.9, label=lm_lab_percell_PAvsFL, parse = TRUE, color = "black", size = 4) +
  theme(legend.position = "none")


# Make the final multi-paneled Plot
# Extract the legend
legend_s1 <- get_legend(plot1)

# Make the 3 plots
top_row_S1 <- plot_grid(plot1 +theme(legend.position = "none"), plot2, plot3, 
          nrow = 1, ncol = 3, 
          labels = c("A", "B", "C"), 
          align = "h")

# Put the legend and 3 plots together
plot_grid(top_row_S1, legend_s1,
          rel_heights = c(1, 0.05),
          nrow = 2, ncol = 1)

Figure S4

Shannon and Simpson’s Evenness

######################################################### SHANNON ENTROPY ######################################################### 
shannon_fraction_wilcox <- wilcox.test(mean ~ fraction, data = shannon)
shannon_fraction_wilcox   
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 119, p-value = 0.00556
## alternative hypothesis: true location shift is not equal to 0
filter(shannon) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
##   fraction `mean(mean)` `sd(mean)`
##   <fct>           <dbl>      <dbl>
## 1 Particle        109.        69.6
## 2 Free             55.9       17.0
# Make a data frame to draw significance line in boxplot (visually calculated)
shannon_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(5.8, 5.9, 5.9, 5.8)) # WholePart vs WholeFree

shannon_distribution_plot <- 
  ggplot(shannon, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(shape = season, fill = fraction), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(3.5,6.1), breaks = seq(from = 3, to =6, by = 0.5)) + 
  xlab("Shannon Entropy") +
  xlab("Fraction") + 
  scale_shape_manual(values = season_shapes) +
    # Add line and pval
  geom_path(data = shannon_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=6.1, size = 8, color = "#424645", label= paste("**"), angle = 90) +
  annotate("text", x=1.5, y=5.5, size = 4, color = "#424645",
           label= paste("p =", round(shannon_fraction_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()

################ Shannon vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_shannon_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Shannon vs Community-wide (Per-Liter) Production 
prod_vs_shannon_plot <-  
  ggplot(shannon, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  ylab("\n Community Production \n (μgC/L/day)") + 
  scale_x_continuous(limits = c(3.5,6.1), breaks = seq(from = 3, to =6, by = 0.5)) + 
  xlab("Shannon Entropy") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=5.25, y=45, label=lm_lab_perliter_shannon_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 


################ Shannon vs Per Capitra (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_percell_shannon_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_percell_prod_vs_shannon_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Shannon vs Per Capitra (Per-Cell) Production 
percell_prod_vs_shannon_plot <- 
  ggplot(shannon, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
  xlab("Shannon Entropy") +
  geom_smooth(data=filter(shannon, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(3.5,6.1), breaks = seq(from = 3, to =6, by = 0.5)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=5.25, y=-7.5, label=lm_lab_percell_shannon_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.title = element_blank(), legend.position = "none")



shannon_plots <- plot_grid(shannon_distribution_plot, prod_vs_shannon_plot, percell_prod_vs_shannon_plot,
          labels = c("A", "C", "E"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.85, 1.1),
          align = "v")


######################################################### SIMPSON'S EVENNESS ######################################################### 
simpseven_fraction_wilcox <- wilcox.test(mean ~ fraction, data = simpseven)
simpseven_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mean by fraction
## W = 55, p-value = 0.3474
## alternative hypothesis: true location shift is not equal to 0
filter(simpseven) %>%
  group_by(fraction) %>%
  summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
##   fraction `mean(mean)` `sd(mean)`
##   <fct>           <dbl>      <dbl>
## 1 Particle       0.0590     0.0256
## 2 Free           0.0714     0.0171
# Make a data frame to draw significance line in boxplot (visually calculated)
simpseven_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(0.14, 0.15, 0.15, 0.14)) # WholePart vs WholeFree

simpseven_distribution_plot <- 
  ggplot(simpseven, aes(y = mean, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  ylab("Simpson's Evenness") +
  scale_shape_manual(values = season_shapes) +
  xlab("Fraction") + 
  geom_path(data = simpseven_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=0.14, size = 4, color = "#424645", label= "NS") +
  theme(legend.position = "none",
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()




################ Simpson's Evenness vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_perliter_simpseven_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_simpseven_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_prod_vs_simpseven_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Simpson's Evenness vs Community-wide (Per-Liter) Production 
prod_vs_simpseven_plot <-  
  ggplot(simpseven, aes(x=mean, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  ylab("\n Community Production \n (μgC/L/day)") + 
  ylab("Simpson's Evenness") +
  geom_smooth(data=filter(simpseven, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=0.13, y=15, label=lm_lab_perliter_simpseven_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank()) 



################ Simpson's Evenness vs Per Capitra (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_percell_simpseven_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_simpseven_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_percell_prod_vs_simpseven_PA)$coefficients[,4][2]), digits = 3), ")")

## 2. Plot Simpson's Evenness vs Per Capitra (Per-Cell) Production 
percell_prod_vs_simpseven_plot <- 
  ggplot(simpseven, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
  xlab("Simpson's Evenness") +
  geom_smooth(data=filter(simpseven, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=0.13, y=-6.3, label=lm_lab_percell_simpseven_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.title = element_blank(), legend.position ="none")


simpseven_plots <- plot_grid(simpseven_distribution_plot, prod_vs_simpseven_plot, percell_prod_vs_simpseven_plot,
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.85, 1.1),
          align = "v")


#### Remove the axis labels for prettier multipaneled plotting
simpseven_plots_noyaxis <- 
  plot_grid(simpseven_distribution_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()), 
            prod_vs_simpseven_plot + theme(axis.title.y = element_blank()), 
            percell_prod_vs_simpseven_plot + theme(axis.title.y = element_blank()), 
          labels = c("B", "D", "F"), ncol = 1, nrow = 3,
          rel_heights = c(0.5, 0.8, 1.1),
          align = "v")

figS2_row1 <- plot_grid(shannon_plots, simpseven_plots_noyaxis,
          ncol = 2, nrow = 1, rel_widths = c(1, 0.75),
          align = "h")

###### PLOT FIGURE S5
plot_grid(figS2_row1, season_legend,
          ncol = 1, nrow = 2, rel_heights = c(1, 0.05))

Figure S5

Compare Diversity Estimates across Station and Season

long_div_df <- lasso_data_df %>%  
  dplyr::select(norep_filter_name, lakesite:season, Richness:Weighted_PD) %>%
  gather(measure, mean, Richness:Weighted_PD) %>%
  mutate(measure = factor(measure, levels = 
                            c("Richness", "Shannon_Entropy", "Inverse_Simpson", "Simpsons_Evenness", "Unweighted_PD", "Weighted_PD")))


divs_PAFLA_plot_lakesite <- ggplot(long_div_df, aes(y = mean, x = lakesite, shape = lakesite)) +
  ylab("Mean Diversity Value\n By Lake Station") +
  facet_wrap(~measure, scale = "free_y", ncol = 6) +
    geom_point(size = 3, aes(fill = fraction), color = "black", position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA, color = "black", aes(fill = fraction)) +
  scale_fill_manual(values = fraction_colors) + 
  scale_shape_manual(values = lakesite_shapes) + 
  scale_color_manual(values = fraction_colors) + 
  theme(legend.position = "none", axis.title.x = element_blank(), 
        axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)) 

plot_A <- plot_grid(divs_PAFLA_plot_lakesite, lakesite_legend,
          nrow = 2, ncol = 1,  labels = c("A",""),
          rel_heights = c(1, 0.05))


divs_PAFLA_plot_season <- ggplot(long_div_df, aes(y = mean, x = season, shape = season)) +
  ylab("Mean Diversity Value \n By Season") +
  facet_wrap(~measure, scale = "free_y", ncol = 6) +
   geom_point(size = 3, aes(fill = fraction), color = "black", position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA, color = "black", aes(fill = fraction)) +
  scale_fill_manual(values = fraction_colors) + 
  scale_shape_manual(values = season_shapes) + 
  scale_color_manual(values = fraction_colors) + 
  theme(legend.position = "none", axis.title.x = element_blank(), 
        axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)) 

plot_B <- plot_grid(divs_PAFLA_plot_season, season_legend,
          nrow = 2, ncol = 1, labels = c("B",""),
          rel_heights = c(1, 0.05))

###### PLOT FIGURE S6
plot_grid(plot_A, plot_B,
          nrow = 2, ncol = 1)

Figure S6

See analysis/OTU_Removal_Analysis* files on the GitHub Repo

Figure S7

Per capita production combined models

################ Figure S7A: Per-cell Heterotrophic Production vs All Observed Richness
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_overall_rich_vs_percell <- 
  paste("atop(R^2 ==", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = richness))$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = richness))$coefficients[,4][2]), digits = 6), ")")

## 2.  Figure S7A: Per-cell Heterotrophic Production vs All Observed Richness
plot_all_rich_percell <- 
  ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, aes(shape = season), color = "black", fill =  "grey") +
  scale_shape_manual(values = season_shapes) + 
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") + xlab("Observed Richness") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(150,950), breaks = seq(from = 0, to =925, by = 150)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=790, y=-8, label=lm_lab_overall_rich_vs_percell, parse = TRUE, color = "#424645", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


################ Figure S7B: Per-cell Heterotrophic Production vs All shannon
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_overall_shannon_vs_percell <- 
  paste("atop(R^2 ==", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = shannon))$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = shannon))$coefficients[,4][2]), digits = 5), ")")

## 2.  Figure S7B: Per-cell Heterotrophic Production vs All Shannon
plot_all_shannon_percell <- 
  ggplot(shannon, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, aes(shape = season), color = "black", fill =  "grey") +
  scale_shape_manual(values = season_shapes) + 
  ylab("log10(production/cell)\n (μgC/cell/day)") + xlab("Shannon Entropy") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(3.5,6), breaks = seq(from = 3, to =6, by = 0.5)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=5.25, y=-8, label=lm_lab_overall_shannon_vs_percell, parse = TRUE, color = "#424645", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

################ Figure S7C: Per-cell Heterotrophic Production vs All Inverse Simpson
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_overall_invsimps_vs_percell <- 
  paste("atop(R^2 ==", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = invsimps))$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = invsimps))$coefficients[,4][2]), digits = 5), ")")

## 2.  Figure S7C: Per-cell Heterotrophic Production vs All Inverse Simpson
plot_all_invsimps_percell <- 
  ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, aes(shape = season), color = "black", fill =  "grey") +
  scale_shape_manual(values = season_shapes) + 
  ylab("log10(production/cell)\n (μgC/cell/day)") + xlab("Inverse Simpson") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=70, y=-8, label=lm_lab_overall_invsimps_vs_percell, parse = TRUE, color = "#424645", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

################ Figure S7D: Per-cell Heterotrophic Production vs All Simpson's Evenness
## 1. Extract the R2 and p-value from the linear model: 
lm_lab_overall_simpseven_vs_percell <- 
  paste("atop(R^2 ==", round(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = simpseven))$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = simpseven))$coefficients[,4][2]), digits = 2), ")")

## 2.  Figure S7D: Per-cell Heterotrophic Production vs All Inverse Simpson
plot_all_simpseven_percell <- 
  ggplot(simpseven, aes(x=mean, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd), color = "grey", alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, aes(shape = season), color = "black", fill =  "grey") +
  scale_shape_manual(values = season_shapes) + 
  ylab("log10(production/cell)\n (μgC/cell/day)") + xlab("Simpson's Evenness") +
  geom_smooth(method='lm', color = "#424645", fill = "#424645") + 
  scale_x_continuous(limits = c(0,0.15), breaks = seq(from = 0, to = 0.15, by = 0.03)) + 
  scale_y_continuous(limits = c(-8.5, -5.4), breaks = seq(from = -9, to = -5, by = 1)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=0.12, y=-8, label=lm_lab_overall_simpseven_vs_percell, parse = TRUE, color = "#424645", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))


figs3_row1 <- plot_grid(plot_all_rich_percell + theme(legend.position = "none"), 
          plot_all_shannon_percell + theme(axis.title.y = element_blank(), legend.position = "none"), 
          plot_all_invsimps_percell + theme(axis.title.y = element_blank(), legend.position = "none"), 
          plot_all_simpseven_percell + theme(axis.title.y = element_blank(), legend.position = "none"),
          align = "h", labels = c("A", "B", "C", "D"),
          rel_widths = c(1.05, 0.9, 0.9, 0.9),
          nrow = 1, ncol = 4)

######## FIGURE S7
# legend
legend_grey <- get_legend(plot_all_rich_percell)


plot_grid(figs3_row1, legend_grey,
                   ncol = 1, nrow = 2, 
                   rel_heights = c(1, 0.05))

Figure S8

Weighted ses.mpd

######################################################### SES MPD DISTRIBUTION: WEIGHTED 
weighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = weighted_df)
weighted_fraction_wilcox
## 
##  Wilcoxon rank sum test
## 
## data:  mpd.obs.z by fraction
## W = 79, p-value = 0.7125
## alternative hypothesis: true location shift is not equal to 0
filter(weighted_df) %>%
  group_by(fraction) %>%
  summarize(mean(mpd.obs.z))
## # A tibble: 2 x 2
##   fraction `mean(mpd.obs.z)`
##   <fct>                <dbl>
## 1 Particle            -0.359
## 2 Free                -0.387
# Make a data frame to draw significance line in boxplot (visually calculated)
dat5 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree

weight_distribution_plot <- 
  ggplot(weighted_df, aes(y = mpd.obs.z, x = fraction)) +
  #scale_color_manual(values = fraction_colors) + 
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3.5, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_shape_manual(values = season_shapes) + 
  scale_y_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
  xlab("Fraction") + 
  geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
  ##### Particle vs free per-cell production 
  geom_path(data = dat5, aes(x = a, y = b), linetype = 1, color = "#424645") +
  annotate("text", x=1.5, y=1.55, fontface = "bold",  size = 3.5, color = "#424645", label= "NS") +
  theme(legend.position = "none", #axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


# Is there a relationship between inverse simpson and Weighted Mean Pairwise distance?
weight_invsimps_vs_mpd_plot <- 
  ggplot(weighted_df, aes(y = mean, x = mpd.obs.z, fill = fraction)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3.5, aes(shape = season)) + ylab("Inverse Simpson") +
  scale_shape_manual(values = season_shapes) + 
  #xlab("Standardized Effect Size \n Weighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  theme(legend.title = element_blank(), legend.position = "none",
        axis.text.x = element_blank(), axis.title.x = element_blank())




# Is there a relationship between PER-LITER PRODUCTION and WEIGHTED Mean Pairwise distance?
weight_prod_vs_mpd_plot <- 
  ggplot(weighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3.5, aes(shape = season)) + 
  scale_shape_manual(values = season_shapes) + 
  ylab("\n Community Production \n (μgC/L/day)") + 
  xlab("Standardized Effect Size \n Weighted Mean Pairwise Dist") +
  scale_fill_manual(values = fraction_colors) +
  scale_color_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  theme(legend.title = element_blank(), legend.position = "none",
        axis.text.x = element_blank(), axis.title.x = element_blank())



################ Weighted Phylogenetic Diversity vs Per Capita (Per-Cell) Production 
weight_percell_vs_mpd_plot <- 
  ggplot(weighted_df, 
       aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z, fill = fraction)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3.5, aes(shape = season)) + 
  scale_shape_manual(values = season_shapes) + 
  ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
  xlab("Weighted Phylogenetic Diversity") +
  scale_fill_manual(values = fraction_colors) +
  scale_x_continuous(limits = c(-1.5,1.75), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) + 
  scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) + 
  theme(legend.title = element_blank(), legend.position ="none", 
        legend.text = element_text(size = 14))


row1_weighted_PD_plot <- plot_grid(weight_distribution_plot, weight_invsimps_vs_mpd_plot, weight_prod_vs_mpd_plot, weight_percell_vs_mpd_plot, 
          labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
          rel_heights = c(0.5, 0.8, 0.8, 1.2),
          align = "v")

### PLOT FIGURE S8
plot_grid(row1_weighted_PD_plot, season_legend,
          ncol = 1, nrow = 2,
          rel_heights = c(1, 0.05))

Figure S9

Randomized Richness

#########################################################  Subset only richness data 
### These are the richness values for the fake samples 
#rich_stats <- filter(otu_alphadiv, measure == "Richness") %>%
#  dplyr::select(1:2) %>%
#  rename(mean_richness = mean) %>%
#  mutate(sample = paste("Sample_", seq(1:nrow(filter(otu_alphadiv, measure == "Richness"))), sep = ""),
#         mean_richness = matround(mean_richness))

## Pick OTUs to match these richness values 

  # List the otus from ALL samples 
#  all_otus <- taxa_names(surface_PAFL_otu_pruned_rm2)
  
  # Obtain the OTU table from the phyloseq object
#  otutab <- otu_table(surface_PAFL_otu_pruned_rm2)
  # Make all the counts to be 0
#  otutab_newvals <- apply(otutab, c(1, 2), function(x) 0)

  # Stop if things are wrong 
#  stopifnot(colnames(otutab_newvals) == all_otus)                       # Do the OTU names match?
#  stopifnot(rownames(otutab_newvals) == rich_stats$norep_filter_name)   # Do the sample names match?
  
# Make it reproducible!   
#set.seed(777)
  
#for (row in 1:nrow(rich_stats)) {
  
  # Pick the richness value 
#  rich_val <- rich_stats[row, 2]  
  
  # Sample the OTUs to represent that richness value 
#  col_index <- sample(ncol(otutab_newvals), rich_val, replace = FALSE, prob = NULL)
  
  # make all other columns 0
#  otutab_newvals[row, col_index] <- 1

#}


## Calculate the tree for those randomized samples 
# create a new phyloseq object 
#random_physeq_presab_raw <- phyloseq(otu_table(otutab_newvals, taxa_are_rows = FALSE), 
#                                 tax_table(surface_PAFL_otu_pruned_rm2), sample_data(surface_PAFL_otu_pruned_rm2))
#random_physeq_presab_raw

# Remove taxa that are 0!
#random_physeq_presab_pruned <- prune_taxa(taxa_sums(random_physeq_presab_raw) > 0, random_physeq_presab_raw) 
#random_physeq_presab_pruned

# Calculate tree 
# Obtain the OTU names that were retained in the dataset
#tax <- data.frame(tax_table(random_physeq_presab_pruned))
#vector_of_otus <- as.vector(tax$OTU)

# Write out the file for processing/fasttree
# write(vector_of_otus, file = "data/PhyloTree/randomized/random_physeq_presab_pruned_OTUnames.txt", ncolumns = 1, append = FALSE, sep = "\n")
# Read in the tree
randomized_tree <- read.tree(file = "data/PhyloTree/randomized/newick_tree_randomized_rmN.tre")
  
  #random_physeq_presab_pruned_tree <- merge_phyloseq(random_physeq_presab_pruned, randomized_tree)
  #random_physeq_presab_pruned_tree
#save(list="random_physeq_presab_pruned_tree", file=paste0("data/PhyloTree/randomized/random_physeq_presab_pruned_tree.RData")) 

load("data/PhyloTree/randomized/random_physeq_presab_pruned_tree.RData")
random_physeq_presab_pruned_tree
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2911 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 44 sample variables ]
## tax_table()   Taxonomy Table:    [ 2911 taxa by 8 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2911 tips and 2909 internal nodes ]
# First force the OTU 
randomized_otu <- matrix(otu_table(random_physeq_presab_pruned_tree), nrow = nrow(otu_table(random_physeq_presab_pruned_tree)))
rownames(randomized_otu) <- sample_names(random_physeq_presab_pruned_tree)
colnames(randomized_otu) <- taxa_names(random_physeq_presab_pruned_tree)
    
  
## Calculate input for SES_MPD  
# Convert the presence/absence data to standardized abundanced  vegan function `decostand' , NOTE: method = "pa"
otu_decostand <- decostand(randomized_otu, method = "pa")
# check total abundance in each sample
apply(otu_decostand, 1, sum)
## MBREJ515 MBREJ715 MBREJ915 MBREK515 MBREK715 MBREK915 MDPEJ515 MDPEJ715 MDPEJ915 MDPEK515 MDPEK715 MDPEK915 MINEJ515 MINEJ715 MINEJ915 MINEK515 MINEK715 MINEK915 MOTEJ515 MOTEJ715 MOTEJ915 MOTEK515 MOTEK715 MOTEK915 
##      906      574      434      268      256      358      493      447      476      276      284      381      840      632      586      452      383      511      505      343      444      274      238      381
# check for mismatches/missing species between community data and phylo tree
randomized_matches <- match.phylo.comm(randomized_tree, otu_decostand)
# the resulting object is a list with $phy and $comm elements.  replace our
# original data with the sorted/matched data
phy_randomized_rm2 <- randomized_matches$phy
comm_randomized_rm2 <- randomized_matches$comm

# Calculate the phylogenetic distances
phy_dist_randomized_rm2 <- cophenetic(phy_randomized_rm2)

  
## Calculate SES_MPD
###################################### INDEPENDENT SWAP ############################################
# calculate standardized effect size mean pairwise distance (ses.mpd)
unweighted_sesMPD_indepswap_randomized <- ses.mpd(comm_randomized_rm2, phy_dist_randomized_rm2, null.model = "independentswap", 
                                     abundance.weighted = FALSE, runs = 999)

df <- unweighted_sesMPD_indepswap_randomized

df$names <- row.names(df)
df_2 <- make_metadata_norep(df) %>%
  mutate(fraction = fct_recode(fraction, Particle = "WholePart", Free = "WholeFree"))
  
  
# Is there a relationship between richness and Unweighted Mean Pairwise distance?
summary(lm(ntaxa ~ mpd.obs.z, data = df_2))
## 
## Call:
## lm(formula = ntaxa ~ mpd.obs.z, data = df_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -214.61 -128.21  -13.53   56.63  465.39 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   448.65      35.29  12.714  1.3e-11 ***
## mpd.obs.z      75.24      95.94   0.784    0.441    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 172.8 on 22 degrees of freedom
## Multiple R-squared:  0.02719,    Adjusted R-squared:  -0.01703 
## F-statistic: 0.615 on 1 and 22 DF,  p-value: 0.4413
summary(lm(ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == "Particle")))
## 
## Call:
## lm(formula = ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == 
##     "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -208.89 -109.98  -51.44   39.09  346.60 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   557.41      50.97  10.935 6.96e-07 ***
## mpd.obs.z     -18.67     153.80  -0.121    0.906    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 175.3 on 10 degrees of freedom
## Multiple R-squared:  0.001471,   Adjusted R-squared:  -0.09838 
## F-statistic: 0.01474 on 1 and 10 DF,  p-value: 0.9058
summary(lm(ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == "Free")))
## 
## Call:
## lm(formula = ntaxa ~ mpd.obs.z, data = filter(df_2, fraction == 
##     "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -108.70  -51.56  -17.29   57.16  160.49 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   343.12      25.00  13.727 8.18e-08 ***
## mpd.obs.z      68.10      62.35   1.092      0.3    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85.34 on 10 degrees of freedom
## Multiple R-squared:  0.1066, Adjusted R-squared:  0.01726 
## F-statistic: 1.193 on 1 and 10 DF,  p-value: 0.3003
randomized_rich_plot <- ggplot(df_2, aes(y = ntaxa, x = mpd.obs.z, fill = fraction)) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
  geom_point(size = 3.5, aes(shape = season)) + ylab("Randomized Richness") +
  xlab("Unweighted Phylogenetic Diversity") +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) + 
  scale_x_continuous(limits = c(-1,1)) + 
  theme(legend.title = element_blank(), legend.position = "none")


plot_grid(randomized_rich_plot, season_legend,
          rel_heights = c(1, 0.05), nrow = 2, ncol = 1)

Figure S10

PCA with Euclidian Distances

# Make the Supplemental Figure
par(mar = c(4.5, 4.5, 1, 1))
biplot(pca_environ,
     xlab = paste("PC1", paste(round(summary(pca_environ)$cont$importance[2,1]*100, digits = 2), "%", sep = ""), sep = ": "),
     ylab = paste("PC2", paste(round(summary(pca_environ)$cont$importance[2,2]*100, digits = 2), "%", sep = ""), sep = ": "))

par(mar = c(5,5,2,5))
plot(summary(pca_environ)$cont$importance[2,]*100, 
     xlab = "PCA Axis", 
     ylab = "Variation Explained Per Axis",
     ylim = c(0, 105),
     col = "cornflowerblue",
     cex =2,
     pch = 16)
par(new = T)
plot(summary(pca_environ)$cont$importance[3,]*100,
       cex = 2,
       pch = 17,
       ylim = c(0, 105),
       col = "firebrick3",
       axes=F, 
       xlab=NA, 
       ylab=NA)
axis(side = 4)
mtext(side = 4, line = 3, "Total Accumulated Variation")
legend("right",
       legend=c("Per Axis", "Total"),
       pch=c(16, 17), col=c("cornflowerblue", "firebrick3"))

# Amount of variation explained by the first two axes
summary(pca_environ)$cont$importance[3, 2]*100
## [1] 69.27135

The first two axes of the PCA explain ~70% of the variation in the environmental data!

Prepare data frames for table S1 and S2

# Add the phylogenetic diversity to the dataframes
weighted_PD_df <- weighted_df %>%
  dplyr::select(norep_filter_name, mpd.obs.z) %>%
  rename(weighted_PD = mpd.obs.z)

unweighted_PD_df <- unweighted_df %>%
  dplyr::select(norep_filter_name, mpd.obs.z) %>%
  rename(unweighted_PD = mpd.obs.z)

metadata_pca_PD <- metadata_pca %>%
  left_join(unweighted_PD_df, by = "norep_filter_name") %>%
  left_join(weighted_PD_df, by = "norep_filter_name")
  

## Prepare the dataframes for Free and Particle data
metadata_pca_PD_free <- filter(metadata_pca_PD, fraction == "Free")
metadata_pca_PD_part <- filter(metadata_pca_PD, fraction == "Particle")  

# List of environmental variables to run linear models on 
        # BGA_cellspermL and Turb_NTU do not have enough data points for regression
lm_variables <- c("frac_bacprod", "fracprod_per_cell_noinf", "Temp_C", "SpCond_uSpercm", "TDS_mgperL", 
                  "pH", "ORP_mV", "Chl_Lab_ugperL", "Cl_mgperL", "SO4_mgperL", "NO3_mgperL", 
                  "NH3_mgperL", "TKN_mgperL", "SRP_ugperL", "TP_ugperL", "Alk_mgperL", "DO_mgperL", 
                  "total_bac_abund","attached_bac", "dnaconcrep1", 
                  "PC1", "PC2", "PC3" ,"PC4", "PC5", "PC6",
                  "unweighted_PD", "weighted_PD") 

########################################################
##############  All Samples: Community-wide 
lm_summary_all_comm <- metadata_pca_PD %>%
  dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
  gather(key = independent, value = measurement, -frac_bacprod) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>% 
  do(glance(lm(frac_bacprod ~ measurement, data = .))) %>% 
  mutate(fraction = "All Samples", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, independent, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  All Samples: Per-capita  
lm_summary_all_percap <- metadata_pca_PD %>%
  dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
  filter(!is.na(fracprod_per_cell_noinf)) %>%   # Remove samples that are not NA
  gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>% 
  mutate(fraction = "All Samples", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, independent, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 



########################################################
##############  Particle: Community-wide 
lm_summary_particle_comm <- metadata_pca_PD_part %>%
  dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
  gather(key = independent, value = measurement, -frac_bacprod) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>% 
  do(glance(lm(frac_bacprod ~ measurement, data = .))) %>% 
  mutate(fraction = "Particle", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, independent, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  Particle: Per-capita  
lm_summary_particle_percap <- metadata_pca_PD_part %>%
  dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
  filter(!is.na(fracprod_per_cell_noinf)) %>%   # Remove samples that are not NA
  gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>% 
  mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, independent, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 


########################################################
##############  Free: Community-wide 
lm_summary_free_comm <- metadata_pca_PD_free %>%
  dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
  gather(key = independent, value = measurement, -frac_bacprod) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>% 
  do(glance(lm(frac_bacprod ~ measurement, data = .))) %>% 
  mutate(fraction = "Free", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, independent, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  Free: Per-capita  
lm_summary_free_percap <- metadata_pca_PD_free %>%
  dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
  filter(!is.na(fracprod_per_cell_noinf)) %>%   # Remove samples that are not NA
  gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>% 
  mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, independent, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 


## Filtered PCA Linear Model Results
# Put all of the PCA and environmental  linear model results together into one dataframe 
all_pca_environ_lm_results <- 
  bind_rows(lm_summary_particle_comm, lm_summary_particle_percap,
          lm_summary_free_comm, lm_summary_free_percap,
          lm_summary_all_comm, lm_summary_all_percap) %>%
  dplyr::rename(independent_var = independent,
                dependent_var = dependent) 

## Combine the results with the diversity linear models 
individual_lm_results <- bind_rows(lm_div_results, all_pca_environ_lm_results) %>%
  mutate(AIC = round(AIC, digits = 2),  
         adj.r.squared = round(adj.r.squared, digits = 2),
         p.value = round(p.value, digits = 4),
         FDR.p = round(FDR.p, digits = 4)) %>%
  rename(FDR.p.value = FDR.p)

Table S1

tableS1 <- individual_lm_results %>%
  dplyr::filter(dependent_var == "Community Production" & FDR.p.value < 0.05) %>%  
  dplyr::select(-c(dependent_var, p.value)) 
 
#### Community-Wide Production    
datatable(tableS1,
       caption = "Table S1: Ordinary least squares regression statistics for community-wide heterotrophic production with a FDR-corrected p-value of less than 0.05.", 
       options = list(pageLength = 50), rownames = FALSE)   

Table S2

tableS2 <- individual_lm_results %>%  
  dplyr::filter(dependent_var == "Per-Capita Production" & FDR.p.value < 0.05) %>%
  dplyr::select(-c(dependent_var, p.value)) 
 
### Per-Capita Production   
datatable(tableS2,
       caption = "Table S2: Ordinary least squares regression statistics for per-capita heterotrophic production with a FDR-corrected p-value of less than 0.05.", 
       options = list(pageLength = 50), rownames = FALSE)    

All Regression Results

### Per-Capita Production     
datatable(individual_lm_results, 
       options = list(pageLength = 20), rownames = FALSE)    

Bonus Analysis

Structural Equation Modeling

colnames(lasso_data_df)
##  [1] "lakesite"                "fraction"                "season"                  "Sample_depth_m"          "Temp_C"                  "SpCond_uSpercm"          "TDS_mgperL"              "pH"                      "ORP_mV"                 
## [10] "Chl_Lab_ugperL"          "BGA_cellspermL"          "Cl_mgperL"               "SO4_mgperL"              "NO3_mgperL"              "NH3_mgperL"              "TKN_mgperL"              "SRP_ugperL"              "TP_ugperL"              
## [19] "Alk_mgperL"              "DO_mgperL"               "DO_percent"              "Turb_NTU"                "norep_filter_name"       "station"                 "total_bac_abund"         "attached_bac"            "perc_attached_abund"    
## [28] "tot_bacprod"             "SD_tot_bacprod"          "frac_bacprod"            "SD_frac_bacprod"         "fraction_bac_abund"      "fracprod_per_cell"       "fracprod_per_cell_noinf" "PC1"                     "PC2"                    
## [37] "Richness"                "Exponential_Shannon"     "Inverse_Simpson"         "Simpsons_Evenness"       "Unweighted_PD"           "Weighted_PD"
lasso_data_subset <- 
  lasso_data_df %>%
  dplyr::select(
                # Basic location info
                lakesite, fraction, season, Sample_depth_m, 
                
                # Environmental Data 
                Temp_C, SpCond_uSpercm, TDS_mgperL, pH,  ORP_mV, 
                Chl_Lab_ugperL, BGA_cellspermL, Cl_mgperL, SO4_mgperL, 
                NO3_mgperL, NH3_mgperL, TKN_mgperL, SRP_ugperL, TP_ugperL,
                Alk_mgperL, DO_mgperL, DO_percent, Turb_NTU, 
                
                # Bacterial abundances 
                total_bac_abund, attached_bac, perc_attached_abund, fraction_bac_abund,
                
                # PRODUCTION 
                tot_bacprod, frac_bacprod, fracprod_per_cell_noinf,
                
                # PC axes
                PC1, PC2, PC3, PC4, PC5, PC6, 
                
                # Biodiversity 
                Richness, Shannon_Entropy, Inverse_Simpson, Simpsons_Evenness, 
                Unweighted_PD, Weighted_PD)
## Error in .f(.x[[i]], ...): object 'PC3' not found
free_sem_data <- lasso_data_subset %>%
  dplyr::filter(fraction == "Free")
## Error in eval(lhs, parent, parent): object 'lasso_data_subset' not found
particle_sem_data <- lasso_data_subset %>%
  dplyr::filter(fraction == "Particle")
## Error in eval(lhs, parent, parent): object 'lasso_data_subset' not found
library(lavaan)

## FREE MODEL
lavaan_free_m <- 'frac_bacprod ~ frac_bacprod'
mlav_free <- sem(lavaan_free_m, data = free_sem_data, meanstructure = TRUE)
## Error in lavaan::lavaan(model = lavaan_free_m, data = free_sem_data, meanstructure = TRUE, : object 'free_sem_data' not found
summary(mlav_free)
## Error in summary(mlav_free): object 'mlav_free' not found
## Particle MODEL
lavaan_part_m <- 'frac_bacprod ~ frac_bacprod'
mlav_part <- sem(lavaan_part_m, data = particle_sem_data)
## Error in lavaan::lavaan(model = lavaan_part_m, data = particle_sem_data, : object 'particle_sem_data' not found
summary(mlav_part)
## Error in summary(mlav_part): object 'mlav_part' not found

Cross Validation

########## PUT A TABLE TOGETHER  
# Per Liter Production 
perliter_row1 <- c("Richness", "Community-Wide",
          round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_rich_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_rich_PA$results$RsquaredSD, digits = 3))

perliter_row2 <- c("Shannon_Entropy", "Community-Wide",
          round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_shannon_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))

perliter_row3 <- c("Inverse_Simpson", "Community-Wide",
          round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_invsimps_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))

perliter_row4 <- c("Simpsons_Evenness","Community-Wide",
          round(summary(lm_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
          round(cv_lm_prod_vs_simpseven_PA$results$Rsquared, digits = 3), 
          round(cv_lm_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))



# Per cell production 
percell_row1 <- c("Richness", "Per-Capita",
          round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_rich_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_rich_PA$results$RsquaredSD, digits = 3))

percell_row2 <- c("Shannon Entropy", "Per-Capita",
          round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_shannon_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_shannon_PA$results$RsquaredSD, digits = 3))

percell_row3 <- c("Inverse Simpson", "Per-Capita",
          round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_invsimps_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_invsimps_PA$results$RsquaredSD, digits = 3))

percell_row4 <- c("Simpsons Evenness", "Per-Capita",
          round(summary(lm_percell_prod_vs_simpseven_PA)$adj.r.squared, digits = 3),
          round(cv_lm_percell_prod_vs_simpseven_PA$results$Rsquared, digits = 3), 
          round(cv_lm_percell_prod_vs_simpseven_PA$results$RsquaredSD, digits = 3))

r2_table <- as.data.frame(rbind(perliter_row1, perliter_row2, perliter_row3, perliter_row4,
                                percell_row1, percell_row2, percell_row3, percell_row4))
colnames(r2_table) <- c("Diversity_Metric", "Production_Measure","Adjusted_R2","CV_R2", "CV_R2_SD")
row.names(r2_table) = NULL

datatable(r2_table,
       caption = "Table S3: \n R2 estimates for heterotrophic production vs particle-associated diversity linear regressions.", 
       options = list(pageLength = 50), rownames = FALSE)  

pH Analysis

################ Free: Per-Capita production correlation with pH
## 1. Run the linear regression 
lm_pH_percap_free <- lm(log10(fracprod_per_cell_noinf) ~ pH, data = dplyr::filter(metadata_pca, fraction == "Free"))
 
# 2. Extract the R2 and p-value from the linear model: 
lm_lab_pH_percap_free <- paste("atop(R^2 ==", round(summary(lm_pH_percap_free)$adj.r.squared, digits = 2), ",",
            "p ==", round(unname(summary(lm_pH_percap_free)$coefficients[,4][2]), digits = 5), ")")

################ Particle: Per-Capita production correlation with pH
## 3. Run the linear regression  
lm_pH_percap_part <- lm(log10(fracprod_per_cell_noinf) ~ pH, data = dplyr::filter(metadata_pca, fraction == "Particle"))

# 4. Extract the R2 and p-value from the linear model: 
lm_lab_pH_percap_part <- paste("atop(R^2 ==", round(summary(lm_pH_percap_part)$adj.r.squared, digits = 2), ",",
            "p ==", round(unname(summary(lm_pH_percap_part)$coefficients[,4][2]), digits = 3), ")")

## 5. Plot Per-Capita production correlation with pH: BOTH FRACTIONS!
plot_pH_per_cap <- ggplot(data = metadata_pca,aes(x = pH, y = log10(fracprod_per_cell_noinf))) +
  geom_point(size = 3, aes(fill = fraction, shape = season)) + 
  ylab("log10(Per-Capita Production)") + 
  scale_shape_manual(values = season_shapes) + 
  scale_fill_manual(values = fraction_colors) + 
  geom_smooth(method = "lm", color = "skyblue", fill = "skyblue", data = dplyr::filter(metadata_pca, fraction == "Free")) +
  geom_smooth(method = "lm", color = "#FF6600", fill = "#FF6600", data = dplyr::filter(metadata_pca, fraction == "Particle")) +
  # Add the R2 and p-value to the plot 
  annotate("text", y = -5.9, x=8.4, label=lm_lab_pH_percap_part, parse = TRUE, color = "#FF6600", size = 4) +
  annotate("text", y = -8, x=8.15, label=lm_lab_pH_percap_free, parse = TRUE, color = "deepskyblue3", size = 4) +
  theme(legend.title = element_blank(), legend.position = "none")  

# Do it again with community-wide production!
################ Free: Community-Wide production correlation with pH
## 1. Run the linear regression 
lm_pH_comm_free <- lm(frac_bacprod ~ pH, data = dplyr::filter(metadata_pca, fraction == "Free"))

# 2. Extract the R2 and p-value from the linear model: 
lm_lab_pH_comm_free <- paste("atop(R^2 ==", round(summary(lm_pH_comm_free)$adj.r.squared, digits = 2), ",",
            "p ==", round(unname(summary(lm_pH_comm_free)$coefficients[,4][2]), digits = 3), ")")

################ Particle: Per-Capita production correlation with pH
## 3. Run the linear regression 
lm_pH_comm_part <- lm(frac_bacprod ~ pH, data = dplyr::filter(metadata_pca, fraction == "Particle"))

# 4. Extract the R2 and p-value from the linear model: 
lm_lab_pH_percap_comm <- paste("atop(R^2 ==", round(summary(lm_pH_comm_part)$adj.r.squared, digits = 2), ",",
            "p ==", round(unname(summary(lm_pH_comm_part)$coefficients[,4][2]), digits = 3), ")")

## 5. Plot Community-Wide production correlation with pH: BOTH FRACTIONS!
plot_pH_comm <- ggplot(data = metadata_pca,aes(x = pH, y = frac_bacprod)) +
  geom_point(size = 3, aes(fill = fraction, shape = season)) + 
  ylab("Community-Wide Production") + 
  scale_shape_manual(values = season_shapes) + 
  scale_fill_manual(values = fraction_colors) + 
  geom_smooth(method = "lm", color = "skyblue", fill = "skyblue", data = dplyr::filter(metadata_pca, fraction == "Free")) +
  geom_smooth(method = "lm", color = "#FF6600", fill = "#FF6600", data = dplyr::filter(metadata_pca, fraction == "Particle")) +
  # Add the R2 and p-value to the plot 
  annotate("text", y = 0, x=8.15, label=lm_lab_pH_percap_comm, parse = TRUE, color = "#FF6600", size = 4) +
  annotate("text", y = 55, x=8.3, label=lm_lab_pH_comm_free, parse = TRUE, color = "deepskyblue3", size = 4) +
  theme(legend.title = element_blank(), legend.position = "none")  

# Put the plots together!
pH_plots <- plot_grid(plot_pH_comm, plot_pH_per_cap,labels = c("A", "B"),nrow = 1, ncol = 2)

plot_grid(pH_plots, season_legend, 
          nrow = 2, ncol = 1, rel_heights = c(1, 0.1))

Community-wide and Per-Capita Heterotrophic Production over Station and Season

# Plot the community wide heterotrophic productivity over station and season
prod1 <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle")), 
       aes(x = lakesite, y = frac_bacprod, fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod), 
                position = position_dodge(width = 0.75), width = 0.25) +
  facet_grid(. ~ season) + 
  scale_y_continuous(expand = c(0,0), limits = c(0, 80), 
                     breaks = seq(from = 0, to = 80, by = 15)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Community Production\n(μg C/L/day)") + 
  theme(axis.text.x = element_blank(), #element_text(angle = 30, vjust = 1, hjust = 1),
        axis.title.x = element_blank(),
        legend.position = c(0.65, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.25), "cm")) #top, right, bottom, and left margins)

# Plot the per-capita heterotrophic productivity over station and season
prod2 <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free", "Particle")), 
       aes(x = lakesite, y = fracprod_per_cell_noinf, fill = fraction)) + 
  geom_bar(stat = "identity", color = "black",  position=position_dodge()) + 
  facet_grid(. ~ season) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Per-Capita Production \n(ug C/cell/day)") + 
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = c(0.65, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

# Put the two plots together 
plot_grid(prod1, prod2, labels = c("A", "B"),
          rel_heights = c(1, 1.3),
          nrow = 2, ncol = 1,
          align = "v")

Total Production

### Total production by station 
prod_by_station <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free")), 
       aes(x = lakesite, y = tot_bacprod)) + 
  geom_boxplot(fill = "grey") + geom_point(size = 3.5, position = position_jitter()) +
  scale_y_continuous(expand = c(0,0), limits = c(0,100)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Lake Station") + ylab("Total Production \n (μg C/cell/day)") + 
  theme(#axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

### Total production by Season 
prod_by_season <- ggplot(dplyr::filter(metadata, year == "2015" & fraction %in% c("Free")), 
       aes(x = season, y = tot_bacprod)) + 
  geom_boxplot(fill = "grey", outlier.shape = NA) + geom_point(size = 3.5, position = position_jitter()) +
  scale_y_continuous(expand = c(0,0), limits = c(0,100)) +
  scale_fill_manual(values = fraction_colors) + 
  guides(fill=guide_legend(nrow=1,byrow=TRUE)) +
  xlab("Season") + ylab("") + 
  theme(#axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        legend.position = c(0.85, 0.9), legend.title = element_blank(),
        plot.margin = unit(c(0.25, 0.25, 0.25, 0.5), "cm")) #top, right, bottom, and left margins)

plot_grid(prod_by_station, prod_by_season, labels = c("A", "B"),
          rel_widths = c(1, 0.8),
          nrow = 1, ncol = 2,
          align = "h")

#ggsave("station-season.png", width = 6.5, height = 3.5)

The highest total heterotrophic production rates occur in the river station and in the spring and summer.

Multiple Regression with PCA results

## Community-wide: Free living with PC1 + PC2 = Not significant
summary(lm(frac_bacprod ~ PC1 + PC2, data = filter(metadata_pca, fraction == "Free")))
## 
## Call:
## lm(formula = frac_bacprod ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.054  -9.327  -7.554  10.981  29.901 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   24.058      4.954   4.857   0.0009 ***
## PC1            6.295      4.871   1.292   0.2284    
## PC2           -4.385      4.871  -0.900   0.3914    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.16 on 9 degrees of freedom
## Multiple R-squared:  0.2161, Adjusted R-squared:  0.04186 
## F-statistic:  1.24 on 2 and 9 DF,  p-value: 0.3344
## Community-wide:Particle-associated with PC1 + PC2 = Not significant
summary(lm(frac_bacprod ~ PC1 + PC2, data = filter(metadata_pca, fraction == "Particle")))
## 
## Call:
## lm(formula = frac_bacprod ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Particle"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.481  -3.757   0.410   3.213  15.482 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    9.958      2.129   4.678  0.00115 **
## PC1            1.107      2.093   0.529  0.60954   
## PC2           -4.424      2.093  -2.114  0.06368 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.373 on 9 degrees of freedom
## Multiple R-squared:  0.3454, Adjusted R-squared:  0.1999 
## F-statistic: 2.374 on 2 and 9 DF,  p-value: 0.1486
## PER CAPITA: Free living with PC1 + PC2 = Not significant
summary(lm(log10(fracprod_per_cell_noinf) ~ PC1 + PC2, data = filter(metadata_pca, fraction == "Free")))
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Free"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5064 -0.2177 -0.1287  0.2483  0.4932 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.5750     0.1047 -72.357  9.3e-14 ***
## PC1           0.1199     0.1029   1.165    0.274    
## PC2          -0.1679     0.1029  -1.631    0.137    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3627 on 9 degrees of freedom
## Multiple R-squared:  0.3087, Adjusted R-squared:  0.1551 
## F-statistic: 2.009 on 2 and 9 DF,  p-value: 0.1899
## PER CAPITA: Particle-associatedwith PC1 + PC2  = SIGNIFICANT (p = 0.015)
lm_percap_PC12_part <- lm(log10(fracprod_per_cell_noinf) ~ PC1 + PC2, data = filter(metadata_pca, fraction == "Particle"))
summary(lm_percap_PC12_part)
## 
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ PC1 + PC2, data = filter(metadata_pca, 
##     fraction == "Particle"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53476 -0.15558 -0.00926  0.08264  0.65269 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.6657     0.1074 -62.056 5.05e-12 ***
## PC1           0.2026     0.1060   1.912  0.09229 .  
## PC2          -0.3818     0.1075  -3.553  0.00747 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3522 on 8 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6498, Adjusted R-squared:  0.5623 
## F-statistic: 7.423 on 2 and 8 DF,  p-value: 0.01504

Important! In a multiple regression, PC1 and PC2 correlate with log10(per-capita heterotrophic production)!

Environmental Variables Pairs

pairs(environmental_data)

Session Information

devtools::session_info() # This will include session info with all R package version information
## ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.2 (2019-12-12)
##  os       macOS Mojave 10.14.6        
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2020-01-02                  
## 
## ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
##  package      * version    date       lib source                           
##  abind          1.4-5      2016-07-21 [1] CRAN (R 3.6.0)                   
##  ade4           1.7-13     2018-08-31 [1] CRAN (R 3.6.0)                   
##  ape          * 5.3        2019-03-17 [1] CRAN (R 3.6.0)                   
##  assertthat     0.2.1      2019-03-21 [1] CRAN (R 3.6.0)                   
##  backports      1.1.4      2019-04-10 [1] CRAN (R 3.6.0)                   
##  Biobase        2.44.0     2019-05-02 [1] Bioconductor                     
##  BiocGenerics   0.30.0     2019-05-02 [1] Bioconductor                     
##  biomformat     1.12.0     2019-05-02 [1] Bioconductor                     
##  Biostrings     2.52.0     2019-05-02 [1] Bioconductor                     
##  bitops         1.0-6      2013-08-17 [1] CRAN (R 3.6.0)                   
##  boot           1.3-23     2019-07-05 [1] CRAN (R 3.6.2)                   
##  broom        * 0.5.2      2019-04-07 [1] CRAN (R 3.6.0)                   
##  callr          3.3.0      2019-07-04 [1] CRAN (R 3.6.0)                   
##  car          * 3.0-3      2019-05-27 [1] CRAN (R 3.6.0)                   
##  carData      * 3.0-2      2018-09-30 [1] CRAN (R 3.6.0)                   
##  caret        * 6.0-84     2019-04-27 [1] CRAN (R 3.6.0)                   
##  cellranger     1.1.0      2016-07-27 [1] CRAN (R 3.6.0)                   
##  class          7.3-15     2019-01-01 [1] CRAN (R 3.6.2)                   
##  cli            1.1.0      2019-03-19 [1] CRAN (R 3.6.0)                   
##  cluster        2.1.0      2019-06-19 [1] CRAN (R 3.6.2)                   
##  codetools      0.2-16     2018-12-24 [1] CRAN (R 3.6.2)                   
##  colorspace     1.4-1      2019-03-18 [1] CRAN (R 3.6.0)                   
##  cowplot      * 0.9.4      2019-01-08 [1] CRAN (R 3.6.0)                   
##  crayon         1.3.4      2017-09-16 [1] CRAN (R 3.6.0)                   
##  crosstalk      1.0.0      2016-12-21 [1] CRAN (R 3.6.0)                   
##  curl           3.3        2019-01-10 [1] CRAN (R 3.6.0)                   
##  data.table     1.12.2     2019-04-07 [1] CRAN (R 3.6.0)                   
##  DBI            1.0.0      2018-05-02 [1] CRAN (R 3.6.0)                   
##  dbplyr         1.4.2      2019-06-17 [1] CRAN (R 3.6.0)                   
##  desc           1.2.0      2018-05-01 [1] CRAN (R 3.6.0)                   
##  devtools     * 2.2.1      2019-09-24 [1] CRAN (R 3.6.0)                   
##  digest         0.6.20     2019-07-04 [1] CRAN (R 3.6.0)                   
##  dplyr        * 0.8.3      2019-07-04 [1] CRAN (R 3.6.0)                   
##  DT           * 0.7        2019-06-11 [1] CRAN (R 3.6.0)                   
##  ellipsis       0.3.0      2019-09-20 [1] CRAN (R 3.6.0)                   
##  evaluate       0.14       2019-05-28 [1] CRAN (R 3.6.0)                   
##  fansi          0.4.0      2018-10-05 [1] CRAN (R 3.6.0)                   
##  forcats      * 0.4.0      2019-02-17 [1] CRAN (R 3.6.0)                   
##  foreach      * 1.4.4      2017-12-12 [1] CRAN (R 3.6.0)                   
##  foreign        0.8-72     2019-08-02 [1] CRAN (R 3.6.2)                   
##  fs             1.3.1      2019-05-06 [1] CRAN (R 3.6.0)                   
##  generics       0.0.2      2018-11-29 [1] CRAN (R 3.6.0)                   
##  ggmap        * 3.0.0      2019-02-05 [1] CRAN (R 3.6.0)                   
##  ggplot2      * 3.2.0      2019-06-16 [1] CRAN (R 3.6.0)                   
##  glmnet       * 2.0-18     2019-05-20 [1] CRAN (R 3.6.0)                   
##  glue           1.3.1      2019-03-12 [1] CRAN (R 3.6.0)                   
##  gower          0.2.1      2019-05-14 [1] CRAN (R 3.6.0)                   
##  gtable         0.3.0      2019-03-25 [1] CRAN (R 3.6.0)                   
##  haven          2.1.1      2019-07-04 [1] CRAN (R 3.6.0)                   
##  hms            0.5.0      2019-07-09 [1] CRAN (R 3.6.0)                   
##  htmltools      0.3.6      2017-04-28 [1] CRAN (R 3.6.0)                   
##  htmlwidgets    1.3        2018-09-30 [1] CRAN (R 3.6.0)                   
##  httpuv         1.5.1      2019-04-05 [1] CRAN (R 3.6.0)                   
##  httr           1.4.0      2018-12-11 [1] CRAN (R 3.6.0)                   
##  igraph         1.2.4.1    2019-04-22 [1] CRAN (R 3.6.0)                   
##  ipred          0.9-9      2019-04-28 [1] CRAN (R 3.6.0)                   
##  IRanges        2.18.1     2019-05-31 [1] Bioconductor                     
##  iterators      1.0.10     2018-07-13 [1] CRAN (R 3.6.0)                   
##  jpeg           0.1-8      2014-01-23 [1] CRAN (R 3.6.0)                   
##  jsonlite       1.6        2018-12-07 [1] CRAN (R 3.6.0)                   
##  knitr          1.23       2019-05-18 [1] CRAN (R 3.6.0)                   
##  labeling       0.3        2014-08-23 [1] CRAN (R 3.6.0)                   
##  later          0.8.0      2019-02-11 [1] CRAN (R 3.6.0)                   
##  lattice      * 0.20-38    2018-11-04 [1] CRAN (R 3.6.2)                   
##  lava           1.6.5      2019-02-12 [1] CRAN (R 3.6.0)                   
##  lavaan       * 0.6-5      2019-08-28 [1] CRAN (R 3.6.0)                   
##  lazyeval       0.2.2      2019-03-15 [1] CRAN (R 3.6.0)                   
##  lme4         * 1.1-21     2019-03-05 [1] CRAN (R 3.6.0)                   
##  lubridate      1.7.4      2018-04-11 [1] CRAN (R 3.6.0)                   
##  magrittr       1.5        2014-11-22 [1] CRAN (R 3.6.0)                   
##  MASS         * 7.3-51.4   2019-03-31 [1] CRAN (R 3.6.2)                   
##  Matrix       * 1.2-18     2019-11-27 [1] CRAN (R 3.6.2)                   
##  memoise        1.1.0      2017-04-21 [1] CRAN (R 3.6.0)                   
##  mgcv           1.8-31     2019-11-09 [1] CRAN (R 3.6.2)                   
##  mime           0.7        2019-06-11 [1] CRAN (R 3.6.0)                   
##  minqa          1.2.4      2014-10-09 [1] CRAN (R 3.6.0)                   
##  mnormt         1.5-5      2016-10-15 [1] CRAN (R 3.6.0)                   
##  ModelMetrics   1.2.2      2018-11-03 [1] CRAN (R 3.6.0)                   
##  modelr         0.1.4      2019-02-18 [1] CRAN (R 3.6.0)                   
##  multtest       2.40.0     2019-05-02 [1] Bioconductor                     
##  munsell        0.5.0      2018-06-12 [1] CRAN (R 3.6.0)                   
##  nlme         * 3.1-142    2019-11-07 [1] CRAN (R 3.6.2)                   
##  nloptr         1.2.1      2018-10-03 [1] CRAN (R 3.6.0)                   
##  nnet           7.3-12     2016-02-02 [1] CRAN (R 3.6.2)                   
##  openxlsx       4.1.0.1    2019-05-28 [1] CRAN (R 3.6.0)                   
##  pander       * 0.6.3      2018-11-06 [1] CRAN (R 3.6.0)                   
##  pbivnorm       0.6.0      2015-01-23 [1] CRAN (R 3.6.0)                   
##  permute      * 0.9-5      2019-03-12 [1] CRAN (R 3.6.0)                   
##  phyloseq     * 1.28.0     2019-05-02 [1] Bioconductor                     
##  picante      * 1.8        2019-03-21 [1] CRAN (R 3.6.0)                   
##  pillar         1.4.2      2019-06-29 [1] CRAN (R 3.6.0)                   
##  pkgbuild       1.0.3      2019-03-20 [1] CRAN (R 3.6.0)                   
##  pkgconfig      2.0.2      2018-08-16 [1] CRAN (R 3.6.0)                   
##  pkgload        1.0.2      2018-10-29 [1] CRAN (R 3.6.0)                   
##  plyr           1.8.4      2016-06-08 [1] CRAN (R 3.6.0)                   
##  png            0.1-7      2013-12-03 [1] CRAN (R 3.6.0)                   
##  prettyunits    1.0.2      2015-07-13 [1] CRAN (R 3.6.0)                   
##  processx       3.4.0      2019-07-03 [1] CRAN (R 3.6.0)                   
##  prodlim        2018.04.18 2018-04-18 [1] CRAN (R 3.6.0)                   
##  promises       1.0.1      2018-04-13 [1] CRAN (R 3.6.0)                   
##  ps             1.3.0      2018-12-21 [1] CRAN (R 3.6.0)                   
##  purrr        * 0.3.2      2019-03-15 [1] CRAN (R 3.6.0)                   
##  R6             2.4.0      2019-02-14 [1] CRAN (R 3.6.0)                   
##  Rcpp           1.0.1      2019-03-17 [1] CRAN (R 3.6.0)                   
##  readr        * 1.3.1      2018-12-21 [1] CRAN (R 3.6.0)                   
##  readxl         1.3.1      2019-03-13 [1] CRAN (R 3.6.0)                   
##  recipes        0.1.6      2019-07-02 [1] CRAN (R 3.6.0)                   
##  remotes        2.1.0      2019-06-24 [1] CRAN (R 3.6.0)                   
##  reprex         0.3.0      2019-05-16 [1] CRAN (R 3.6.0)                   
##  reshape2       1.4.3      2017-12-11 [1] CRAN (R 3.6.0)                   
##  RgoogleMaps    1.4.4      2019-08-20 [1] CRAN (R 3.6.0)                   
##  rhdf5          2.28.0     2019-05-02 [1] Bioconductor                     
##  Rhdf5lib       1.6.0      2019-05-02 [1] Bioconductor                     
##  rio            0.5.16     2018-11-26 [1] CRAN (R 3.6.0)                   
##  rjson          0.2.20     2018-06-08 [1] CRAN (R 3.6.0)                   
##  rlang          0.4.0      2019-06-25 [1] CRAN (R 3.6.0)                   
##  rmarkdown      1.13       2019-05-22 [1] CRAN (R 3.6.0)                   
##  rpart          4.1-15     2019-04-12 [1] CRAN (R 3.6.2)                   
##  rprojroot      1.3-2      2018-01-03 [1] CRAN (R 3.6.0)                   
##  rstudioapi     0.10       2019-03-19 [1] CRAN (R 3.6.0)                   
##  rvest          0.3.4      2019-05-15 [1] CRAN (R 3.6.0)                   
##  S4Vectors      0.22.0     2019-05-02 [1] Bioconductor                     
##  sandwich     * 2.5-1      2019-04-06 [1] CRAN (R 3.6.0)                   
##  scales         1.0.0      2018-08-09 [1] CRAN (R 3.6.0)                   
##  sessioninfo    1.1.1      2018-11-05 [1] CRAN (R 3.6.0)                   
##  shiny          1.3.2      2019-04-22 [1] CRAN (R 3.6.0)                   
##  stringi        1.4.3      2019-03-12 [1] CRAN (R 3.6.0)                   
##  stringr      * 1.4.0      2019-02-10 [1] CRAN (R 3.6.0)                   
##  survival       3.1-8      2019-12-03 [1] CRAN (R 3.6.2)                   
##  testthat       2.1.1      2019-04-23 [1] CRAN (R 3.6.0)                   
##  tibble       * 2.1.3      2019-06-06 [1] CRAN (R 3.6.0)                   
##  tidyr        * 0.8.3      2019-03-01 [1] CRAN (R 3.6.0)                   
##  tidyselect     0.2.5      2018-10-11 [1] CRAN (R 3.6.0)                   
##  tidyverse    * 1.2.1.9000 2019-07-03 [1] Github (hadley/tidyverse@53a3146)
##  timeDate       3043.102   2018-02-21 [1] CRAN (R 3.6.0)                   
##  usethis      * 1.5.1      2019-07-04 [1] CRAN (R 3.6.0)                   
##  utf8           1.1.4      2018-05-24 [1] CRAN (R 3.6.0)                   
##  vctrs          0.2.0      2019-07-05 [1] CRAN (R 3.6.0)                   
##  vegan        * 2.5-5      2019-05-12 [1] CRAN (R 3.6.0)                   
##  withr          2.1.2      2018-03-15 [1] CRAN (R 3.6.0)                   
##  xfun           0.8        2019-06-25 [1] CRAN (R 3.6.0)                   
##  xml2           1.2.0      2018-01-24 [1] CRAN (R 3.6.0)                   
##  xtable         1.8-4      2019-04-21 [1] CRAN (R 3.6.0)                   
##  XVector        0.24.0     2019-05-02 [1] Bioconductor                     
##  yaml           2.2.0      2018-07-25 [1] CRAN (R 3.6.0)                   
##  zeallot        0.1.0      2018-01-28 [1] CRAN (R 3.6.0)                   
##  zip            2.0.3      2019-07-03 [1] CRAN (R 3.6.0)                   
##  zlibbioc       1.30.0     2019-05-02 [1] Bioconductor                     
##  zoo            1.8-6      2019-05-28 [1] CRAN (R 3.6.0)                   
## 
## [1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library